From 9647027f55bda3f7b35ae74db1fa636cdebc4daf Mon Sep 17 00:00:00 2001 From: Andras Fuchs Date: Fri, 25 Nov 2022 09:48:22 +0100 Subject: [PATCH 01/24] Fix a typo --- src/Microsoft.ML.SearchSpace/Parameter.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.SearchSpace/Parameter.cs b/src/Microsoft.ML.SearchSpace/Parameter.cs index a2fb0406dd..58eeff0b6d 100644 --- a/src/Microsoft.ML.SearchSpace/Parameter.cs +++ b/src/Microsoft.ML.SearchSpace/Parameter.cs @@ -50,7 +50,7 @@ public enum ParameterType } /// - /// is used to save sweeping result from tuner and is used to restore mlnet pipeline from sweepable pipline. + /// is used to save sweeping result from tuner and is used to restore mlnet pipeline from sweepable pipeline. /// [JsonConverter(typeof(ParameterConverter))] public sealed class Parameter : IDictionary, IEquatable, IEqualityComparer From 269b1bdd93696cffcb79a6b3140bdb9f01bbb7ce Mon Sep 17 00:00:00 2001 From: Andras Fuchs Date: Fri, 25 Nov 2022 09:49:43 +0100 Subject: [PATCH 02/24] Fix trial cancellation bug --- .../Runner/SweepablePipelineRunner.cs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs index d42363bde0..e8334660b8 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs @@ -15,13 +15,14 @@ namespace Microsoft.ML.AutoML { - internal class SweepablePipelineRunner : ITrialRunner + public class SweepablePipelineRunner : ITrialRunner { private MLContext? _mLContext; private readonly IEvaluateMetricManager _metricManager; private readonly IDatasetManager _datasetManager; private readonly SweepablePipeline _pipeline; private readonly IChannel? _logger; + private CancellationTokenRegistration _ctRegistration; public SweepablePipelineRunner(MLContext context, SweepablePipeline pipeline, IEvaluateMetricManager metricManager, IDatasetManager datasetManager, IChannel? logger = null) { @@ -91,13 +92,12 @@ public Task RunAsync(TrialSettings settings, CancellationToken ct) { try { - using (var ctRegistration = ct.Register(() => + _ctRegistration = ct.Register(() => { _mLContext?.CancelExecution(); - })) - { - return Task.Run(() => Run(settings)); - } + }); + + return Task.Run(() => Run(settings)); } catch (Exception ex) when (ct.IsCancellationRequested) { From a2c578178dcb4212ddc3cdbb9d4e3c65b82f9252 Mon Sep 17 00:00:00 2001 From: Andras Fuchs Date: Fri, 25 Nov 2022 09:52:07 +0100 Subject: [PATCH 03/24] Move performance related properties to TrialPerformanceMetrics and add ReportTrialResourceUsage event to IMonitor --- .../NotebookMonitor.cs | 6 +++ .../AutoMLExperiment/AutoMLExperiment.cs | 51 ++++++++++++++----- .../AutoMLExperiment/IMonitor.cs | 16 ++++-- .../AutoMLExperiment/IPerformanceMonitor.cs | 24 ++++++--- .../TrialPerformanceMetrics.cs | 19 +++++++ .../AutoMLExperiment/TrialSettings.cs | 9 +++- .../AutoMLExperimentTests.cs | 7 +-- 7 files changed, 99 insertions(+), 33 deletions(-) create mode 100644 src/Microsoft.ML.AutoML/AutoMLExperiment/TrialPerformanceMetrics.cs diff --git a/src/Microsoft.ML.AutoML.Interactive/NotebookMonitor.cs b/src/Microsoft.ML.AutoML.Interactive/NotebookMonitor.cs index 81b22a9435..42555ea749 100644 --- a/src/Microsoft.ML.AutoML.Interactive/NotebookMonitor.cs +++ b/src/Microsoft.ML.AutoML.Interactive/NotebookMonitor.cs @@ -24,6 +24,8 @@ public class NotebookMonitor : IMonitor public List CompletedTrials { get; set; } public DataFrame TrialData { get; set; } + public int ResourceUsageCheckInterval => 5000; + public NotebookMonitor(SweepablePipeline pipeline) { CompletedTrials = new List(); @@ -84,5 +86,9 @@ public void SetUpdate(DisplayedValue valueToUpdate) _valueToUpdate = valueToUpdate; ThrottledUpdate(); } + + public void ReportTrialResourceUsage(TrialSettings setting) + { + } } } diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs index 60d2790f28..6d171cfdd0 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. using System; +using System.Diagnostics; using System.Linq; using System.Text.Json; using System.Threading; @@ -249,19 +250,37 @@ public async Task RunAsync(CancellationToken ct = default) var trialNum = trialResultManager?.GetAllTrialResults().Max(t => t.TrialSettings?.TrialId) + 1 ?? 0; var tuner = serviceProvider.GetService(); Contracts.Assert(tuner != null, "tuner can't be null"); + while (!aggregateTrainingStopManager.IsStopTrainingRequested()) { - var setting = new TrialSettings() + var trialSettings = new TrialSettings() { TrialId = trialNum++, Parameter = Parameter.CreateNestedParameter(), + StartedAtUtc = DateTime.UtcNow, + CancellationTokenSource = null, + PerformanceMetrics = new TrialPerformanceMetrics(), }; - var parameter = tuner.Propose(setting); - setting.Parameter = parameter; + var parameter = tuner.Propose(trialSettings); + trialSettings.Parameter = parameter; - monitor?.ReportRunningTrial(setting); using (var trialCancellationTokenSource = new CancellationTokenSource()) { + trialSettings.CancellationTokenSource = trialCancellationTokenSource; + monitor?.ReportRunningTrial(trialSettings); + + System.Timers.Timer resourceUsageTimer = null; + if ((monitor != null) && (monitor?.ResourceUsageCheckInterval > 0)) + { + resourceUsageTimer = new System.Timers.Timer(monitor.ResourceUsageCheckInterval); + resourceUsageTimer.Elapsed += (o, e) => + { + monitor?.ReportTrialResourceUsage(trialSettings); + }; + resourceUsageTimer.AutoReset = true; + resourceUsageTimer.Enabled = false; + } + void handler(object o, EventArgs e) { // only force-canceling running trials when there's completed trials. @@ -276,21 +295,25 @@ void handler(object o, EventArgs e) { aggregateTrainingStopManager.OnStopTraining += handler; - performanceMonitor.MemoryUsageInMegaByte += (o, m) => + performanceMonitor.PerformanceMetricsUpdated += (o, metrics) => { - if (_settings.MaximumMemoryUsageInMegaByte is double d && m > d && !trialCancellationTokenSource.IsCancellationRequested) + trialSettings.PerformanceMetrics = metrics; + + if (_settings.MaximumMemoryUsageInMegaByte is double d && metrics.PeakMemoryUsage > d && !trialCancellationTokenSource.IsCancellationRequested) { - logger.Trace($"cancel current trial {setting.TrialId} because it uses {m} mb memory and the maximum memory usage is {d}"); + logger.Trace($"cancel current trial {trialSettings.TrialId} because it uses {metrics.PeakMemoryUsage} mb memory and the maximum memory usage is {d}"); trialCancellationTokenSource.Cancel(); - GC.AddMemoryPressure(Convert.ToInt64(m) * 1024 * 1024); + GC.AddMemoryPressure(Convert.ToInt64(metrics.PeakMemoryUsage) * 1024 * 1024); GC.Collect(); } }; + var trialTask = runner.RunAsync(trialSettings, trialCancellationTokenSource.Token); performanceMonitor.Start(); - logger.Trace($"trial setting - {JsonSerializer.Serialize(setting)}"); - var trialResult = await runner.RunAsync(setting, trialCancellationTokenSource.Token); + resourceUsageTimer?.Start(); + logger.Trace($"trial setting - {JsonSerializer.Serialize(trialSettings)}"); + var trialResult = await trialTask; var peakCpu = performanceMonitor?.GetPeakCpuUsage(); var peakMemoryInMB = performanceMonitor?.GetPeakMemoryUsageInMegaByte(); @@ -313,10 +336,10 @@ void handler(object o, EventArgs e) } catch (OperationCanceledException ex) when (aggregateTrainingStopManager.IsStopTrainingRequested() == false) { - monitor?.ReportFailTrial(setting, ex); + monitor?.ReportFailTrial(trialSettings, ex); var result = new TrialResult { - TrialSettings = setting, + TrialSettings = trialSettings, Loss = double.MaxValue, }; @@ -329,7 +352,7 @@ void handler(object o, EventArgs e) } catch (Exception ex) { - monitor?.ReportFailTrial(setting, ex); + monitor?.ReportFailTrial(trialSettings, ex); if (!aggregateTrainingStopManager.IsStopTrainingRequested() && _bestTrialResult == null) { @@ -343,7 +366,7 @@ void handler(object o, EventArgs e) finally { aggregateTrainingStopManager.OnStopTraining -= handler; - + resourceUsageTimer?.Stop(); } } } diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs index 6a28d80010..fc40e74918 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs @@ -13,13 +13,12 @@ namespace Microsoft.ML.AutoML /// public interface IMonitor { + int ResourceUsageCheckInterval { get; } void ReportCompletedTrial(TrialResult result); - void ReportBestTrial(TrialResult result); - void ReportFailTrial(TrialSettings settings, Exception exception = null); - - void ReportRunningTrial(TrialSettings setting); + void ReportRunningTrial(TrialSettings settings); + void ReportTrialResourceUsage(TrialSettings settings); } /// @@ -30,11 +29,14 @@ internal class MLContextMonitor : IMonitor private readonly IChannel _logger; private readonly List _completedTrials; private readonly SweepablePipeline _pipeline; - public MLContextMonitor(IChannel logger, SweepablePipeline pipeline) + public int ResourceUsageCheckInterval { get; private set; } + + public MLContextMonitor(IChannel logger, SweepablePipeline pipeline, int resourceUsageCheckInterval = 6000) { _logger = logger; _completedTrials = new List(); _pipeline = pipeline; + ResourceUsageCheckInterval = resourceUsageCheckInterval; } public virtual void ReportBestTrial(TrialResult result) @@ -57,6 +59,10 @@ public virtual void ReportRunningTrial(TrialSettings setting) { _logger.Info($"Update Running Trial - Id: {setting.TrialId} - Pipeline: {_pipeline.ToString(setting.Parameter)}"); } + + public void ReportTrialResourceUsage(TrialSettings setting) + { + } } internal class TrialResultMonitor : MLContextMonitor diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs index fa86bb5894..3342992b56 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs @@ -5,6 +5,8 @@ using System; using System.Collections.Generic; using System.Diagnostics; +using System.IO; +using System.Linq; using System.Text; using System.Threading.Tasks; using System.Timers; @@ -22,9 +24,7 @@ internal interface IPerformanceMonitor : IDisposable double? GetPeakCpuUsage(); - public event EventHandler CpuUsage; - - public event EventHandler MemoryUsageInMegaByte; + public event EventHandler PerformanceMetricsUpdated; } internal class DefaultPerformanceMonitor : IPerformanceMonitor @@ -43,9 +43,7 @@ public DefaultPerformanceMonitor(IChannel logger, int checkIntervalInMillisecond } - public event EventHandler CpuUsage; - - public event EventHandler MemoryUsageInMegaByte; + public event EventHandler PerformanceMetricsUpdated; public void Dispose() @@ -110,9 +108,19 @@ private void SampleCpuAndMemoryUsage() // calculate Memory Usage in MB var memoryUsage = process.PrivateMemorySize64 * 1.0 / (1024 * 1024); _peakMemoryUsage = Math.Max(memoryUsage, _peakMemoryUsage ?? 0); + + var metrics = new TrialPerformanceMetrics() + { + CpuUsage = cpuUsageInTotal, + MemoryUsage = memoryUsage, + PeakCpuUsage = _peakCpuUsage, + PeakMemoryUsage = _peakMemoryUsage, + FreeSpaceOnDrives = DriveInfo.GetDrives().Select(d => (float)d.AvailableFreeSpace / (1024 * 1024)).ToArray() + }; + _logger?.Trace($"current CPU: {cpuUsageInTotal}, current Memory(mb): {memoryUsage}"); - MemoryUsageInMegaByte?.Invoke(this, memoryUsage); - CpuUsage?.Invoke(this, cpuUsageInTotal); + + PerformanceMetricsUpdated?.Invoke(this, metrics); } } } diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialPerformanceMetrics.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialPerformanceMetrics.cs new file mode 100644 index 0000000000..a83e58155a --- /dev/null +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialPerformanceMetrics.cs @@ -0,0 +1,19 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.ML.AutoML +{ + public class TrialPerformanceMetrics + { + public double? PeakMemoryUsage { get; set; } + public double? PeakCpuUsage { get; set; } + public double CpuUsage { get; internal set; } + public double MemoryUsage { get; internal set; } + public float[] FreeSpaceOnDrives { get; internal set; } + } +} diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs index 38d14c2a6d..2d11cb882a 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs @@ -2,6 +2,10 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System; +using System.Runtime.Serialization; +using System.Text.Json.Serialization; +using System.Threading; using Microsoft.ML.SearchSpace; namespace Microsoft.ML.AutoML @@ -9,7 +13,10 @@ namespace Microsoft.ML.AutoML public class TrialSettings { public int TrialId { get; set; } - public Parameter Parameter { get; set; } + [JsonIgnore] + public CancellationTokenSource CancellationTokenSource { get; set; } + public TrialPerformanceMetrics PerformanceMetrics { get; internal set; } + public DateTime StartedAtUtc { get; internal set; } } } diff --git a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs index 6d14b266bb..0d7c5be3d7 100644 --- a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs @@ -407,9 +407,7 @@ public DummyPeformanceMonitor() _checkIntervalInMilliseconds = 1000; } - public event EventHandler CpuUsage; - - public event EventHandler MemoryUsageInMegaByte; + public event EventHandler PerformanceMetricsUpdated; public void Dispose() { @@ -432,8 +430,7 @@ public void Start() _timer = new System.Timers.Timer(_checkIntervalInMilliseconds); _timer.Elapsed += (o, e) => { - CpuUsage?.Invoke(this, 100); - MemoryUsageInMegaByte?.Invoke(this, 1000); + PerformanceMetricsUpdated?.Invoke(this, new TrialPerformanceMetrics() { PeakCpuUsage = 100, PeakMemoryUsage = 1000 }); }; _timer.AutoReset = true; From e3fd9920cb94b7fd7c91f2a0da3e8a2eec2fc0d7 Mon Sep 17 00:00:00 2001 From: Andras Fuchs Date: Fri, 25 Nov 2022 10:15:25 +0100 Subject: [PATCH 04/24] Add new class and property explanations --- .../AutoMLExperiment/IMonitor.cs | 3 +++ .../TrialPerformanceMetrics.cs | 18 ++++++++++++++++++ .../AutoMLExperiment/TrialSettings.cs | 18 ++++++++++++++++++ 3 files changed, 39 insertions(+) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs index fc40e74918..3799e56644 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs @@ -13,6 +13,9 @@ namespace Microsoft.ML.AutoML /// public interface IMonitor { + /// + /// Interval in milliseconds to report resource usage. + /// int ResourceUsageCheckInterval { get; } void ReportCompletedTrial(TrialResult result); void ReportBestTrial(TrialResult result); diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialPerformanceMetrics.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialPerformanceMetrics.cs index a83e58155a..454e73e645 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialPerformanceMetrics.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialPerformanceMetrics.cs @@ -8,12 +8,30 @@ namespace Microsoft.ML.AutoML { + /// + /// Performance metrics for a trial. + /// public class TrialPerformanceMetrics { + /// + /// Peak memory usage during the trial in megabytes + /// public double? PeakMemoryUsage { get; set; } + /// + /// Peak CPU usage during the trial + /// public double? PeakCpuUsage { get; set; } + /// + /// Current CPU usage of the runner process + /// public double CpuUsage { get; internal set; } + /// + /// Current memory usage of the runner process in megabytes + /// public double MemoryUsage { get; internal set; } + /// + /// The free space available on each drive in megabytes + /// public float[] FreeSpaceOnDrives { get; internal set; } } } diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs index 2d11cb882a..55f5b46de0 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs @@ -10,13 +10,31 @@ namespace Microsoft.ML.AutoML { + /// + /// Settings used for the trial + /// public class TrialSettings { + /// + /// Identifier of the trial + /// public int TrialId { get; set; } + /// + /// Parameters for the pipeline used in this trial + /// public Parameter Parameter { get; set; } + /// + /// Cancellation token source to have the ability to cancel the trial + /// [JsonIgnore] public CancellationTokenSource CancellationTokenSource { get; set; } + /// + /// Performance metrics of the trial + /// public TrialPerformanceMetrics PerformanceMetrics { get; internal set; } + /// + /// The time when the trial started (UTC) + /// public DateTime StartedAtUtc { get; internal set; } } } From 88fdefa67c695aac3e8833beb7da1d3eec360026 Mon Sep 17 00:00:00 2001 From: Andras Fuchs Date: Fri, 2 Dec 2022 10:21:34 +0100 Subject: [PATCH 05/24] Revert "Fix trial cancellation bug" This reverts commit 269b1bdd93696cffcb79a6b3140bdb9f01bbb7ce. --- .../Runner/SweepablePipelineRunner.cs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs index e8334660b8..d42363bde0 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/Runner/SweepablePipelineRunner.cs @@ -15,14 +15,13 @@ namespace Microsoft.ML.AutoML { - public class SweepablePipelineRunner : ITrialRunner + internal class SweepablePipelineRunner : ITrialRunner { private MLContext? _mLContext; private readonly IEvaluateMetricManager _metricManager; private readonly IDatasetManager _datasetManager; private readonly SweepablePipeline _pipeline; private readonly IChannel? _logger; - private CancellationTokenRegistration _ctRegistration; public SweepablePipelineRunner(MLContext context, SweepablePipeline pipeline, IEvaluateMetricManager metricManager, IDatasetManager datasetManager, IChannel? logger = null) { @@ -92,12 +91,13 @@ public Task RunAsync(TrialSettings settings, CancellationToken ct) { try { - _ctRegistration = ct.Register(() => + using (var ctRegistration = ct.Register(() => { _mLContext?.CancelExecution(); - }); - - return Task.Run(() => Run(settings)); + })) + { + return Task.Run(() => Run(settings)); + } } catch (Exception ex) when (ct.IsCancellationRequested) { From b03e46aa9ed07c2cbdf98f0beb30fb74ccf816fd Mon Sep 17 00:00:00 2001 From: Andras Fuchs Date: Tue, 6 Dec 2022 12:24:13 +0100 Subject: [PATCH 06/24] Remove pipeline info from the IMonitor Running event --- src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs index 3799e56644..4e6157b755 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs @@ -60,7 +60,7 @@ public virtual void ReportFailTrial(TrialSettings settings, Exception exception public virtual void ReportRunningTrial(TrialSettings setting) { - _logger.Info($"Update Running Trial - Id: {setting.TrialId} - Pipeline: {_pipeline.ToString(setting.Parameter)}"); + _logger.Info($"Update Running Trial - Id: {setting.TrialId}"); } public void ReportTrialResourceUsage(TrialSettings setting) From bf69dd21ede552be9bb4d796eec86fbe4f0cc2c0 Mon Sep 17 00:00:00 2001 From: Andras Fuchs Date: Tue, 6 Dec 2022 12:25:54 +0100 Subject: [PATCH 07/24] Remove FreeSpaceOnDrives from TrialPerformanceMetrics --- .../AutoMLExperiment/IPerformanceMonitor.cs | 3 +-- .../AutoMLExperiment/TrialPerformanceMetrics.cs | 4 ---- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs index 3342992b56..d29fcfcdd4 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs @@ -114,8 +114,7 @@ private void SampleCpuAndMemoryUsage() CpuUsage = cpuUsageInTotal, MemoryUsage = memoryUsage, PeakCpuUsage = _peakCpuUsage, - PeakMemoryUsage = _peakMemoryUsage, - FreeSpaceOnDrives = DriveInfo.GetDrives().Select(d => (float)d.AvailableFreeSpace / (1024 * 1024)).ToArray() + PeakMemoryUsage = _peakMemoryUsage }; _logger?.Trace($"current CPU: {cpuUsageInTotal}, current Memory(mb): {memoryUsage}"); diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialPerformanceMetrics.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialPerformanceMetrics.cs index 454e73e645..e35ca8d47a 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialPerformanceMetrics.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialPerformanceMetrics.cs @@ -29,9 +29,5 @@ public class TrialPerformanceMetrics /// Current memory usage of the runner process in megabytes /// public double MemoryUsage { get; internal set; } - /// - /// The free space available on each drive in megabytes - /// - public float[] FreeSpaceOnDrives { get; internal set; } } } From 38cf838ecbe6710ce46d11dfc772cb49e2241579 Mon Sep 17 00:00:00 2001 From: Andras Fuchs Date: Tue, 6 Dec 2022 12:29:08 +0100 Subject: [PATCH 08/24] Change the default resource check interval to 5 seconds --- src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs index 4e6157b755..4ab941d1d9 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs @@ -34,7 +34,7 @@ internal class MLContextMonitor : IMonitor private readonly SweepablePipeline _pipeline; public int ResourceUsageCheckInterval { get; private set; } - public MLContextMonitor(IChannel logger, SweepablePipeline pipeline, int resourceUsageCheckInterval = 6000) + public MLContextMonitor(IChannel logger, SweepablePipeline pipeline, int resourceUsageCheckInterval = 5000) { _logger = logger; _completedTrials = new List(); From 7f40df50c0d8e04cf0ccbead266c109158e69763 Mon Sep 17 00:00:00 2001 From: Andras Fuchs Date: Thu, 22 Dec 2022 10:19:01 +0100 Subject: [PATCH 09/24] Remove StartedAtUtc property from TrialSettings --- src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs | 1 - src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs | 4 ---- 2 files changed, 5 deletions(-) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs index 6d171cfdd0..286607247c 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs @@ -257,7 +257,6 @@ public async Task RunAsync(CancellationToken ct = default) { TrialId = trialNum++, Parameter = Parameter.CreateNestedParameter(), - StartedAtUtc = DateTime.UtcNow, CancellationTokenSource = null, PerformanceMetrics = new TrialPerformanceMetrics(), }; diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs index 55f5b46de0..1a0dd29f69 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs @@ -32,9 +32,5 @@ public class TrialSettings /// Performance metrics of the trial /// public TrialPerformanceMetrics PerformanceMetrics { get; internal set; } - /// - /// The time when the trial started (UTC) - /// - public DateTime StartedAtUtc { get; internal set; } } } From 8aa0ad88c7b73d5a73936a00cb88252787bd68fd Mon Sep 17 00:00:00 2001 From: XiaoYun Zhang Date: Tue, 3 Jan 2023 11:58:48 -0800 Subject: [PATCH 10/24] move ReportTrialResourceUsage to IPerformanceMonitor --- .../NotebookMonitor.cs | 6 --- .../API/AutoMLExperimentExtension.cs | 41 ++++++++++++++++- .../AutoMLExperiment/AutoMLExperiment.cs | 44 +------------------ .../AutoMLExperiment/IMonitor.cs | 16 ++----- .../AutoMLExperiment/IPerformanceMonitor.cs | 28 ++++++++++-- .../AutoMLExperiment/TrialSettings.cs | 9 ---- .../AutoMLExperimentTests.cs | 7 ++- 7 files changed, 74 insertions(+), 77 deletions(-) diff --git a/src/Microsoft.ML.AutoML.Interactive/NotebookMonitor.cs b/src/Microsoft.ML.AutoML.Interactive/NotebookMonitor.cs index 42555ea749..81b22a9435 100644 --- a/src/Microsoft.ML.AutoML.Interactive/NotebookMonitor.cs +++ b/src/Microsoft.ML.AutoML.Interactive/NotebookMonitor.cs @@ -24,8 +24,6 @@ public class NotebookMonitor : IMonitor public List CompletedTrials { get; set; } public DataFrame TrialData { get; set; } - public int ResourceUsageCheckInterval => 5000; - public NotebookMonitor(SweepablePipeline pipeline) { CompletedTrials = new List(); @@ -86,9 +84,5 @@ public void SetUpdate(DisplayedValue valueToUpdate) _valueToUpdate = valueToUpdate; ThrottledUpdate(); } - - public void ReportTrialResourceUsage(TrialSettings setting) - { - } } } diff --git a/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs b/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs index b2437588d0..e0dacdf055 100644 --- a/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs +++ b/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs @@ -149,18 +149,55 @@ public static AutoMLExperiment SetPipeline(this AutoMLExperiment experiment, Swe return experiment; } + /// + /// Set as for . + /// + /// + /// the interval in milliseconds for to sample + /// public static AutoMLExperiment SetPerformanceMonitor(this AutoMLExperiment experiment, int checkIntervalInMilliseconds = 1000) { experiment.SetPerformanceMonitor((service) => { var channel = service.GetService(); - - return new DefaultPerformanceMonitor(channel, checkIntervalInMilliseconds); + var settings = service.GetRequiredService(); + return new DefaultPerformanceMonitor(settings, channel, checkIntervalInMilliseconds); }); return experiment; } + /// + /// Set a custom performance monitor as for . + /// + /// + /// + /// + /// + public static AutoMLExperiment SetPerformanceMonitor(this AutoMLExperiment experiment, Func factory) + where TPerformanceMonitor : class, IPerformanceMonitor + + { + experiment.ServiceCollection.AddTransient(factory); + + return experiment; + } + + /// + /// Set a custom performance monitor as for . + /// + /// + /// + /// + public static AutoMLExperiment SetPerformanceMonitor(this AutoMLExperiment experiment) + where TPerformanceMonitor : class, IPerformanceMonitor + + { + experiment.ServiceCollection.AddTransient(); + + return experiment; + } + /// /// Set as tuner for hyper-parameter optimization. The performance of smac is in a large extend determined /// by , and , which are used to fit smac's inner diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs index 286607247c..2af5414fa2 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs @@ -194,22 +194,6 @@ public AutoMLExperiment SetTuner() return this; } - internal AutoMLExperiment SetPerformanceMonitor() - where TPerformanceMonitor : class, IPerformanceMonitor - { - _serviceCollection.AddTransient(); - - return this; - } - - internal AutoMLExperiment SetPerformanceMonitor(Func factory) - where TPerformanceMonitor : class, IPerformanceMonitor - { - _serviceCollection.AddTransient(factory); - - return this; - } - /// /// Run experiment and return the best trial result synchronizely. /// @@ -257,29 +241,14 @@ public async Task RunAsync(CancellationToken ct = default) { TrialId = trialNum++, Parameter = Parameter.CreateNestedParameter(), - CancellationTokenSource = null, - PerformanceMetrics = new TrialPerformanceMetrics(), }; var parameter = tuner.Propose(trialSettings); trialSettings.Parameter = parameter; using (var trialCancellationTokenSource = new CancellationTokenSource()) { - trialSettings.CancellationTokenSource = trialCancellationTokenSource; monitor?.ReportRunningTrial(trialSettings); - System.Timers.Timer resourceUsageTimer = null; - if ((monitor != null) && (monitor?.ResourceUsageCheckInterval > 0)) - { - resourceUsageTimer = new System.Timers.Timer(monitor.ResourceUsageCheckInterval); - resourceUsageTimer.Elapsed += (o, e) => - { - monitor?.ReportTrialResourceUsage(trialSettings); - }; - resourceUsageTimer.AutoReset = true; - resourceUsageTimer.Enabled = false; - } - void handler(object o, EventArgs e) { // only force-canceling running trials when there's completed trials. @@ -296,21 +265,11 @@ void handler(object o, EventArgs e) performanceMonitor.PerformanceMetricsUpdated += (o, metrics) => { - trialSettings.PerformanceMetrics = metrics; - - if (_settings.MaximumMemoryUsageInMegaByte is double d && metrics.PeakMemoryUsage > d && !trialCancellationTokenSource.IsCancellationRequested) - { - logger.Trace($"cancel current trial {trialSettings.TrialId} because it uses {metrics.PeakMemoryUsage} mb memory and the maximum memory usage is {d}"); - trialCancellationTokenSource.Cancel(); - - GC.AddMemoryPressure(Convert.ToInt64(metrics.PeakMemoryUsage) * 1024 * 1024); - GC.Collect(); - } + performanceMonitor.OnPerformanceMetricsUpdatedHandler(trialSettings, metrics, trialCancellationTokenSource); }; var trialTask = runner.RunAsync(trialSettings, trialCancellationTokenSource.Token); performanceMonitor.Start(); - resourceUsageTimer?.Start(); logger.Trace($"trial setting - {JsonSerializer.Serialize(trialSettings)}"); var trialResult = await trialTask; @@ -365,7 +324,6 @@ void handler(object o, EventArgs e) finally { aggregateTrainingStopManager.OnStopTraining -= handler; - resourceUsageTimer?.Stop(); } } } diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs index 4ab941d1d9..51335f8d94 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs @@ -13,15 +13,13 @@ namespace Microsoft.ML.AutoML /// public interface IMonitor { - /// - /// Interval in milliseconds to report resource usage. - /// - int ResourceUsageCheckInterval { get; } void ReportCompletedTrial(TrialResult result); + void ReportBestTrial(TrialResult result); + void ReportFailTrial(TrialSettings settings, Exception exception = null); + void ReportRunningTrial(TrialSettings settings); - void ReportTrialResourceUsage(TrialSettings settings); } /// @@ -32,14 +30,12 @@ internal class MLContextMonitor : IMonitor private readonly IChannel _logger; private readonly List _completedTrials; private readonly SweepablePipeline _pipeline; - public int ResourceUsageCheckInterval { get; private set; } - public MLContextMonitor(IChannel logger, SweepablePipeline pipeline, int resourceUsageCheckInterval = 5000) + public MLContextMonitor(IChannel logger, SweepablePipeline pipeline) { _logger = logger; _completedTrials = new List(); _pipeline = pipeline; - ResourceUsageCheckInterval = resourceUsageCheckInterval; } public virtual void ReportBestTrial(TrialResult result) @@ -62,10 +58,6 @@ public virtual void ReportRunningTrial(TrialSettings setting) { _logger.Info($"Update Running Trial - Id: {setting.TrialId}"); } - - public void ReportTrialResourceUsage(TrialSettings setting) - { - } } internal class TrialResultMonitor : MLContextMonitor diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs index d29fcfcdd4..1e0d07d3b3 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs @@ -8,13 +8,15 @@ using System.IO; using System.Linq; using System.Text; +using System.Threading; using System.Threading.Tasks; using System.Timers; using Microsoft.ML.Runtime; +using Timer = System.Timers.Timer; namespace Microsoft.ML.AutoML { - internal interface IPerformanceMonitor : IDisposable + public interface IPerformanceMonitor : IDisposable { void Start(); @@ -24,20 +26,28 @@ internal interface IPerformanceMonitor : IDisposable double? GetPeakCpuUsage(); + /// + /// The handler function every time get fired. + /// + void OnPerformanceMetricsUpdatedHandler(TrialSettings trialSettings, TrialPerformanceMetrics metrics, CancellationTokenSource trialCancellationTokenSource); + + public event EventHandler PerformanceMetricsUpdated; } - internal class DefaultPerformanceMonitor : IPerformanceMonitor + public class DefaultPerformanceMonitor : IPerformanceMonitor { private readonly IChannel _logger; + private readonly AutoMLExperiment.AutoMLExperimentSettings _settings; private Timer _timer; private double? _peakCpuUsage; private double? _peakMemoryUsage; private readonly int _checkIntervalInMilliseconds; private TimeSpan _totalCpuProcessorTime; - public DefaultPerformanceMonitor(IChannel logger, int checkIntervalInMilliseconds) + public DefaultPerformanceMonitor(AutoMLExperiment.AutoMLExperimentSettings settings, IChannel logger, int checkIntervalInMilliseconds) { + _settings = settings; _logger = logger; _checkIntervalInMilliseconds = checkIntervalInMilliseconds; } @@ -122,5 +132,17 @@ private void SampleCpuAndMemoryUsage() PerformanceMetricsUpdated?.Invoke(this, metrics); } } + + public virtual void OnPerformanceMetricsUpdatedHandler(TrialSettings trialSettings, TrialPerformanceMetrics metrics, CancellationTokenSource trialCancellationTokenSource) + { + if (_settings.MaximumMemoryUsageInMegaByte is double d && metrics.PeakMemoryUsage > d && !trialCancellationTokenSource.IsCancellationRequested) + { + _logger.Trace($"cancel current trial {trialSettings.TrialId} because it uses {metrics.PeakMemoryUsage} mb memory and the maximum memory usage is {d}"); + trialCancellationTokenSource.Cancel(); + + GC.AddMemoryPressure(Convert.ToInt64(metrics.PeakMemoryUsage) * 1024 * 1024); + GC.Collect(); + } + } } } diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs index 1a0dd29f69..9980fc1366 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs @@ -23,14 +23,5 @@ public class TrialSettings /// Parameters for the pipeline used in this trial /// public Parameter Parameter { get; set; } - /// - /// Cancellation token source to have the ability to cancel the trial - /// - [JsonIgnore] - public CancellationTokenSource CancellationTokenSource { get; set; } - /// - /// Performance metrics of the trial - /// - public TrialPerformanceMetrics PerformanceMetrics { get; internal set; } } } diff --git a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs index 0d7c5be3d7..d22c654403 100644 --- a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs @@ -81,8 +81,7 @@ public async Task AutoMLExperiment_cancel_trial_when_exceeds_memory_limit_Async( return new DummyTrialRunner(settings, 5, channel); }) .SetTuner() - .SetMaximumMemoryUsageInMegaByte(0.01) - .SetPerformanceMonitor(); + .SetMaximumMemoryUsageInMegaByte(0.01); var runExperimentAction = async () => await experiment.RunAsync(); await runExperimentAction.Should().ThrowExactlyAsync(); @@ -423,6 +422,10 @@ public void Dispose() return 1000; } + public void OnPerformanceMetricsUpdatedHandler(TrialSettings trialSettings, TrialPerformanceMetrics metrics, CancellationTokenSource trialCancellationTokenSource) + { + } + public void Start() { if (_timer == null) From 739d8659c6c32c9466158727fae4e774338f0954 Mon Sep 17 00:00:00 2001 From: Xiaoyun Zhang Date: Tue, 3 Jan 2023 12:02:24 -0800 Subject: [PATCH 11/24] Update AutoMLExperimentExtension.cs --- src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs b/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs index e0dacdf055..35d0a679ad 100644 --- a/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs +++ b/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs @@ -176,7 +176,6 @@ public static AutoMLExperiment SetPerformanceMonitor(this AutoMLExperiment exper /// public static AutoMLExperiment SetPerformanceMonitor(this AutoMLExperiment experiment, Func factory) where TPerformanceMonitor : class, IPerformanceMonitor - { experiment.ServiceCollection.AddTransient(factory); @@ -191,7 +190,6 @@ public static AutoMLExperiment SetPerformanceMonitor(this A /// public static AutoMLExperiment SetPerformanceMonitor(this AutoMLExperiment experiment) where TPerformanceMonitor : class, IPerformanceMonitor - { experiment.ServiceCollection.AddTransient(); From fc82c4c924511a5049357283b7bc672924fee243 Mon Sep 17 00:00:00 2001 From: Andras Fuchs Date: Fri, 6 Jan 2023 10:41:50 +0100 Subject: [PATCH 12/24] Pause the performance monitor if the trial is not running --- .../AutoMLExperiment/AutoMLExperiment.cs | 1 + .../AutoMLExperiment/IPerformanceMonitor.cs | 13 ++++++++++++- .../AutoMLExperimentTests.cs | 7 ++++++- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs index 2af5414fa2..e0a1070da9 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs @@ -278,6 +278,7 @@ void handler(object o, EventArgs e) trialResult.PeakCpu = peakCpu; trialResult.PeakMemoryInMegaByte = peakMemoryInMB; + performanceMonitor.Pause(); monitor?.ReportCompletedTrial(trialResult); tuner.Update(trialResult); trialResultManager?.AddOrUpdateTrialResult(trialResult); diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs index 1e0d07d3b3..1be581c49a 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs @@ -20,6 +20,8 @@ public interface IPerformanceMonitor : IDisposable { void Start(); + void Pause(); + void Stop(); double? GetPeakMemoryUsageInMegaByte(); @@ -79,9 +81,18 @@ public void Start() _totalCpuProcessorTime = Process.GetCurrentProcess().TotalProcessorTime; _timer.Elapsed += OnCheckCpuAndMemoryUsage; _timer.AutoReset = true; - _timer.Enabled = true; _logger?.Trace($"{typeof(DefaultPerformanceMonitor)} has been started"); } + + // trigger the PerformanceMetricsUpdated event and (re)start the timer + _timer.Enabled = false; + SampleCpuAndMemoryUsage(); + _timer.Enabled = true; + } + + public void Pause() + { + _timer.Enabled = false; } public void Stop() diff --git a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs index d22c654403..f214dde638 100644 --- a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs @@ -437,8 +437,13 @@ public void Start() }; _timer.AutoReset = true; - _timer.Enabled = true; } + _timer.Enabled = true; + } + + public void Pause() + { + _timer.Enabled = false; } public void Stop() From d0ce0cd60f5a53cef77b0c6ae9f65f308700b5ff Mon Sep 17 00:00:00 2001 From: Andras Fuchs Date: Sun, 8 Jan 2023 19:24:23 +0100 Subject: [PATCH 13/24] Add StartedAtUtc and EndedAtUtc to TrialSettings --- .../AutoMLExperiment/AutoMLExperiment.cs | 4 ++++ src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs index e0a1070da9..0d80f74176 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs @@ -241,6 +241,7 @@ public async Task RunAsync(CancellationToken ct = default) { TrialId = trialNum++, Parameter = Parameter.CreateNestedParameter(), + StartedAtUtc = DateTime.UtcNow, }; var parameter = tuner.Propose(trialSettings); trialSettings.Parameter = parameter; @@ -277,6 +278,7 @@ void handler(object o, EventArgs e) var peakMemoryInMB = performanceMonitor?.GetPeakMemoryUsageInMegaByte(); trialResult.PeakCpu = peakCpu; trialResult.PeakMemoryInMegaByte = peakMemoryInMB; + trialResult.TrialSettings.EndedAtUtc = DateTime.UtcNow; performanceMonitor.Pause(); monitor?.ReportCompletedTrial(trialResult); @@ -295,6 +297,7 @@ void handler(object o, EventArgs e) } catch (OperationCanceledException ex) when (aggregateTrainingStopManager.IsStopTrainingRequested() == false) { + trialSettings.EndedAtUtc = DateTime.UtcNow; monitor?.ReportFailTrial(trialSettings, ex); var result = new TrialResult { @@ -311,6 +314,7 @@ void handler(object o, EventArgs e) } catch (Exception ex) { + trialSettings.EndedAtUtc = DateTime.UtcNow; monitor?.ReportFailTrial(trialSettings, ex); if (!aggregateTrainingStopManager.IsStopTrainingRequested() && _bestTrialResult == null) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs index 9980fc1366..f86bb58bfe 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs @@ -20,6 +20,14 @@ public class TrialSettings /// public int TrialId { get; set; } /// + /// UTC time when the trial started + /// + public DateTime StartedAtUtc { get; set; } + /// + /// UTC time when the trial ended, null if it's still running + /// + public DateTime? EndedAtUtc { get; set; } + /// /// Parameters for the pipeline used in this trial /// public Parameter Parameter { get; set; } From 4149a4b5e4606edcb7bc1e1b92948eecd58ca96e Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Mon, 6 Feb 2023 14:55:26 -0800 Subject: [PATCH 14/24] cancel trial when as is --- .../AutoMLExperiment/AutoMLExperiment.cs | 5 +---- .../AutoMLExperiment/IStopTrainingManager.cs | 13 ++++++++++++- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs index 0d80f74176..58ab1ec9f2 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs @@ -252,10 +252,7 @@ public async Task RunAsync(CancellationToken ct = default) void handler(object o, EventArgs e) { - // only force-canceling running trials when there's completed trials. - // otherwise, wait for the current running trial to be completed. - if (_bestTrialResult != null) - trialCancellationTokenSource.Cancel(); + trialCancellationTokenSource.Cancel(); } try { diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IStopTrainingManager.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IStopTrainingManager.cs index e49ceef4ee..7f6b94641a 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IStopTrainingManager.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IStopTrainingManager.cs @@ -140,10 +140,21 @@ public void AddTrainingStopManager(IStopTrainingManager manager) _managers.Add(manager); manager.OnStopTraining += (o, e) => { - OnStopTraining?.Invoke(this, e); + if (_managers.Exists(manager.Equals)) + { + OnStopTraining?.Invoke(this, e); + } }; } + public void RemoveTrainingStopManagerIfExist(IStopTrainingManager manager) + { + if (_managers.Exists(manager.Equals)) + { + _managers.RemoveAll(manager.Equals); + } + } + public void Update(TrialResult result) { foreach (var manager in _managers) From 7d3257af7d829e70d8508306b119cd2c9d6f0aa4 Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Mon, 6 Feb 2023 15:17:01 -0800 Subject: [PATCH 15/24] fix tests --- test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs b/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs index 8986b92dfb..432895c441 100644 --- a/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs @@ -131,7 +131,7 @@ public void AutoFit_Taxi_Fare_Train_Test_Split_Test() var label = "fare_amount"; var settings = new RegressionExperimentSettings { - MaxExperimentTimeInSeconds = 1, + MaxModels = 1, }; settings.Trainers.Remove(RegressionTrainer.LightGbm); settings.Trainers.Remove(RegressionTrainer.StochasticDualCoordinateAscent); @@ -161,7 +161,7 @@ public void AutoFit_Taxi_Fare_CrossValidation_10_Test() var label = "fare_amount"; var settings = new RegressionExperimentSettings { - MaxExperimentTimeInSeconds = 1, + MaxModels = 1, }; settings.Trainers.Remove(RegressionTrainer.LightGbm); settings.Trainers.Remove(RegressionTrainer.StochasticDualCoordinateAscent); @@ -191,7 +191,7 @@ public void AutoFit_Taxi_Fare_Test() var label = "fare_amount"; var settings = new RegressionExperimentSettings { - MaxExperimentTimeInSeconds = 1, + MaxModels = 1, }; settings.Trainers.Remove(RegressionTrainer.LightGbm); settings.Trainers.Remove(RegressionTrainer.StochasticDualCoordinateAscent); From 3919324c3fd258b30d8f6e6a026729ac22cfd372 Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Mon, 6 Feb 2023 16:26:02 -0800 Subject: [PATCH 16/24] fix tests --- test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs index d59a111f47..7a15a4d1ba 100644 --- a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs @@ -73,7 +73,7 @@ public async Task AutoMLExperiment_cancel_trial_when_exceeds_memory_limit_Async( // the following experiment set memory usage limit to 0.01mb // so all trials should be canceled and there should be no successful trials. // therefore when experiment finishes, it should throw timeout exception with no model trained message. - experiment.SetTrainingTimeInSeconds(10) + experiment.SetMaxModelToExplore(10) .SetTrialRunner((serviceProvider) => { var channel = serviceProvider.GetService(); From 13ba949092423398c3bcad6a45635ab4fa9b5406 Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Tue, 7 Feb 2023 12:02:31 -0800 Subject: [PATCH 17/24] fix tests --- .../AutoMLExperiment/AutoMLExperiment.cs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs index 58ab1ec9f2..9727e2039f 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs @@ -296,13 +296,15 @@ void handler(object o, EventArgs e) { trialSettings.EndedAtUtc = DateTime.UtcNow; monitor?.ReportFailTrial(trialSettings, ex); - var result = new TrialResult + var trialResult = new TrialResult { TrialSettings = trialSettings, Loss = double.MaxValue, }; - tuner.Update(result); + tuner.Update(trialResult); + trialResultManager?.AddOrUpdateTrialResult(trialResult); + aggregateTrainingStopManager.Update(trialResult); continue; } catch (OperationCanceledException) when (aggregateTrainingStopManager.IsStopTrainingRequested()) From 488ff2053a8c72bb59a6ed60ab11335bb1792f89 Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Tue, 7 Feb 2023 23:47:23 -0800 Subject: [PATCH 18/24] use workingset to evaluate memory usage --- .../AutoMLExperiment/IPerformanceMonitor.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs index 1be581c49a..387aab2da2 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs @@ -127,8 +127,8 @@ private void SampleCpuAndMemoryUsage() _peakCpuUsage = Math.Max(cpuUsageInTotal, _peakCpuUsage ?? 0); // calculate Memory Usage in MB - var memoryUsage = process.PrivateMemorySize64 * 1.0 / (1024 * 1024); - _peakMemoryUsage = Math.Max(memoryUsage, _peakMemoryUsage ?? 0); + var memoryUsage = process.WorkingSet64 * 1.0 / (1024 * 1024); + _peakMemoryUsage = process.PeakWorkingSet64 * 1.0 / (1024 * 1024); var metrics = new TrialPerformanceMetrics() { From 49ac8ae2d986c9e507bcc4835b410371239f68ed Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Wed, 8 Feb 2023 10:22:11 -0800 Subject: [PATCH 19/24] remove handler --- .../AutoMLExperiment/AutoMLExperiment.cs | 10 ---------- .../Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs | 4 +--- 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs index 9727e2039f..194781c897 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs @@ -250,17 +250,11 @@ public async Task RunAsync(CancellationToken ct = default) { monitor?.ReportRunningTrial(trialSettings); - void handler(object o, EventArgs e) - { - trialCancellationTokenSource.Cancel(); - } try { using (var performanceMonitor = serviceProvider.GetService()) using (var runner = serviceProvider.GetRequiredService()) { - aggregateTrainingStopManager.OnStopTraining += handler; - performanceMonitor.PerformanceMetricsUpdated += (o, metrics) => { performanceMonitor.OnPerformanceMetricsUpdatedHandler(trialSettings, metrics, trialCancellationTokenSource); @@ -325,10 +319,6 @@ void handler(object o, EventArgs e) throw; } } - finally - { - aggregateTrainingStopManager.OnStopTraining -= handler; - } } } diff --git a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs index 7a15a4d1ba..8e073394c6 100644 --- a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs @@ -365,13 +365,11 @@ public void AutoMLExperiment_should_use_seed_from_context_if_provided() class DummyTrialRunner : ITrialRunner { private readonly int _finishAfterNSeconds; - private readonly CancellationToken _ct; private readonly IChannel _logger; public DummyTrialRunner(AutoMLExperiment.AutoMLExperimentSettings automlSettings, int finishAfterNSeconds, IChannel logger) { _finishAfterNSeconds = finishAfterNSeconds; - _ct = automlSettings.CancellationToken; _logger = logger; } @@ -383,7 +381,7 @@ public async Task RunAsync(TrialSettings settings, CancellationToke { _logger.Info("Update Running Trial"); await Task.Delay(_finishAfterNSeconds * 1000, ct); - _ct.ThrowIfCancellationRequested(); + ct.ThrowIfCancellationRequested(); _logger.Info("Update Completed Trial"); var metric = 1.000 + 0.01 * settings.TrialId; return new TrialResult From 3722dcbe4ab6e07883ef26e2c2063fbcdb690765 Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Wed, 8 Feb 2023 10:35:41 -0800 Subject: [PATCH 20/24] add handler back --- .../AutoMLExperiment/AutoMLExperiment.cs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs index 194781c897..6ee171dd3e 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs @@ -250,11 +250,16 @@ public async Task RunAsync(CancellationToken ct = default) { monitor?.ReportRunningTrial(trialSettings); + void handler(object o, EventArgs e) + { + trialCancellationTokenSource.Cancel(); + } try { using (var performanceMonitor = serviceProvider.GetService()) using (var runner = serviceProvider.GetRequiredService()) { + aggregateTrainingStopManager.OnStopTraining += handler; performanceMonitor.PerformanceMetricsUpdated += (o, metrics) => { performanceMonitor.OnPerformanceMetricsUpdatedHandler(trialSettings, metrics, trialCancellationTokenSource); @@ -319,6 +324,10 @@ public async Task RunAsync(CancellationToken ct = default) throw; } } + finally + { + aggregateTrainingStopManager.OnStopTraining -= handler; + } } } From ff55857d2f5ea684c9e4314e2fcca8161952ad80 Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Wed, 8 Feb 2023 10:47:24 -0800 Subject: [PATCH 21/24] add more logging --- .../AutoMLExperiment/AutoMLExperiment.cs | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs index 6ee171dd3e..616b8d2696 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs @@ -293,6 +293,7 @@ void handler(object o, EventArgs e) } catch (OperationCanceledException ex) when (aggregateTrainingStopManager.IsStopTrainingRequested() == false) { + logger.Trace($"trial cancelled - {JsonSerializer.Serialize(trialSettings)}, continue training"); trialSettings.EndedAtUtc = DateTime.UtcNow; monitor?.ReportFailTrial(trialSettings, ex); var trialResult = new TrialResult @@ -308,15 +309,21 @@ void handler(object o, EventArgs e) } catch (OperationCanceledException) when (aggregateTrainingStopManager.IsStopTrainingRequested()) { + logger.Trace($"trial cancelled - {JsonSerializer.Serialize(trialSettings)}, stop training"); + break; } catch (Exception ex) { + logger.Trace($"trial failed - {JsonSerializer.Serialize(trialSettings)}, stop training"); + trialSettings.EndedAtUtc = DateTime.UtcNow; monitor?.ReportFailTrial(trialSettings, ex); if (!aggregateTrainingStopManager.IsStopTrainingRequested() && _bestTrialResult == null) { + logger.Trace($"trial fatal error - {JsonSerializer.Serialize(trialSettings)}, stop training"); + // TODO // it's questionable on whether to abort the entire training process // for a single fail trial. We should make it an option and only exit From 509f9637748bf5968c145f99e792ec143fd4f571 Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Wed, 8 Feb 2023 13:29:51 -0800 Subject: [PATCH 22/24] add more logger --- src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs index 616b8d2696..ab89bfcf13 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs @@ -265,10 +265,9 @@ void handler(object o, EventArgs e) performanceMonitor.OnPerformanceMetricsUpdatedHandler(trialSettings, metrics, trialCancellationTokenSource); }; - var trialTask = runner.RunAsync(trialSettings, trialCancellationTokenSource.Token); performanceMonitor.Start(); logger.Trace($"trial setting - {JsonSerializer.Serialize(trialSettings)}"); - var trialResult = await trialTask; + var trialResult = await runner.RunAsync(trialSettings, trialCancellationTokenSource.Token); var peakCpu = performanceMonitor?.GetPeakCpuUsage(); var peakMemoryInMB = performanceMonitor?.GetPeakMemoryUsageInMegaByte(); From 1240335f474aa6d104af64b36bedb5c4a1128144 Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Thu, 9 Feb 2023 09:59:22 -0800 Subject: [PATCH 23/24] add logging --- src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs index 387aab2da2..c27e8ff0fd 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs @@ -146,6 +146,7 @@ private void SampleCpuAndMemoryUsage() public virtual void OnPerformanceMetricsUpdatedHandler(TrialSettings trialSettings, TrialPerformanceMetrics metrics, CancellationTokenSource trialCancellationTokenSource) { + _logger.Trace($"maximum memory usage: {_settings.MaximumMemoryUsageInMegaByte}, PeakMemoryUsage: {metrics.PeakMemoryUsage} trialIsCancelled: {trialCancellationTokenSource.IsCancellationRequested}"); if (_settings.MaximumMemoryUsageInMegaByte is double d && metrics.PeakMemoryUsage > d && !trialCancellationTokenSource.IsCancellationRequested) { _logger.Trace($"cancel current trial {trialSettings.TrialId} because it uses {metrics.PeakMemoryUsage} mb memory and the maximum memory usage is {d}"); From 5a27af4539314ec703db2fcce3966c0b40cd95d6 Mon Sep 17 00:00:00 2001 From: LittleLittleCloud Date: Thu, 9 Feb 2023 10:31:19 -0800 Subject: [PATCH 24/24] fix tests --- src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs index c27e8ff0fd..a3a0601d66 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs @@ -128,7 +128,7 @@ private void SampleCpuAndMemoryUsage() // calculate Memory Usage in MB var memoryUsage = process.WorkingSet64 * 1.0 / (1024 * 1024); - _peakMemoryUsage = process.PeakWorkingSet64 * 1.0 / (1024 * 1024); + _peakMemoryUsage = Math.Max(memoryUsage, _peakMemoryUsage ?? 0); var metrics = new TrialPerformanceMetrics() {