diff --git a/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs b/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs index b2437588d0..35d0a679ad 100644 --- a/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs +++ b/src/Microsoft.ML.AutoML/API/AutoMLExperimentExtension.cs @@ -149,18 +149,53 @@ public static AutoMLExperiment SetPipeline(this AutoMLExperiment experiment, Swe return experiment; } + /// + /// Set as for . + /// + /// + /// the interval in milliseconds for to sample + /// public static AutoMLExperiment SetPerformanceMonitor(this AutoMLExperiment experiment, int checkIntervalInMilliseconds = 1000) { experiment.SetPerformanceMonitor((service) => { var channel = service.GetService(); - - return new DefaultPerformanceMonitor(channel, checkIntervalInMilliseconds); + var settings = service.GetRequiredService(); + return new DefaultPerformanceMonitor(settings, channel, checkIntervalInMilliseconds); }); return experiment; } + /// + /// Set a custom performance monitor as for . + /// + /// + /// + /// + /// + public static AutoMLExperiment SetPerformanceMonitor(this AutoMLExperiment experiment, Func factory) + where TPerformanceMonitor : class, IPerformanceMonitor + { + experiment.ServiceCollection.AddTransient(factory); + + return experiment; + } + + /// + /// Set a custom performance monitor as for . + /// + /// + /// + /// + public static AutoMLExperiment SetPerformanceMonitor(this AutoMLExperiment experiment) + where TPerformanceMonitor : class, IPerformanceMonitor + { + experiment.ServiceCollection.AddTransient(); + + return experiment; + } + /// /// Set as tuner for hyper-parameter optimization. The performance of smac is in a large extend determined /// by , and , which are used to fit smac's inner diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs index 60d2790f28..ab89bfcf13 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs @@ -3,6 +3,7 @@ // See the LICENSE file in the project root for more information. using System; +using System.Diagnostics; using System.Linq; using System.Text.Json; using System.Threading; @@ -193,22 +194,6 @@ public AutoMLExperiment SetTuner() return this; } - internal AutoMLExperiment SetPerformanceMonitor() - where TPerformanceMonitor : class, IPerformanceMonitor - { - _serviceCollection.AddTransient(); - - return this; - } - - internal AutoMLExperiment SetPerformanceMonitor(Func factory) - where TPerformanceMonitor : class, IPerformanceMonitor - { - _serviceCollection.AddTransient(factory); - - return this; - } - /// /// Run experiment and return the best trial result synchronizely. /// @@ -249,25 +234,25 @@ public async Task RunAsync(CancellationToken ct = default) var trialNum = trialResultManager?.GetAllTrialResults().Max(t => t.TrialSettings?.TrialId) + 1 ?? 0; var tuner = serviceProvider.GetService(); Contracts.Assert(tuner != null, "tuner can't be null"); + while (!aggregateTrainingStopManager.IsStopTrainingRequested()) { - var setting = new TrialSettings() + var trialSettings = new TrialSettings() { TrialId = trialNum++, Parameter = Parameter.CreateNestedParameter(), + StartedAtUtc = DateTime.UtcNow, }; - var parameter = tuner.Propose(setting); - setting.Parameter = parameter; + var parameter = tuner.Propose(trialSettings); + trialSettings.Parameter = parameter; - monitor?.ReportRunningTrial(setting); using (var trialCancellationTokenSource = new CancellationTokenSource()) { + monitor?.ReportRunningTrial(trialSettings); + void handler(object o, EventArgs e) { - // only force-canceling running trials when there's completed trials. - // otherwise, wait for the current running trial to be completed. - if (_bestTrialResult != null) - trialCancellationTokenSource.Cancel(); + trialCancellationTokenSource.Cancel(); } try { @@ -275,28 +260,22 @@ void handler(object o, EventArgs e) using (var runner = serviceProvider.GetRequiredService()) { aggregateTrainingStopManager.OnStopTraining += handler; - - performanceMonitor.MemoryUsageInMegaByte += (o, m) => + performanceMonitor.PerformanceMetricsUpdated += (o, metrics) => { - if (_settings.MaximumMemoryUsageInMegaByte is double d && m > d && !trialCancellationTokenSource.IsCancellationRequested) - { - logger.Trace($"cancel current trial {setting.TrialId} because it uses {m} mb memory and the maximum memory usage is {d}"); - trialCancellationTokenSource.Cancel(); - - GC.AddMemoryPressure(Convert.ToInt64(m) * 1024 * 1024); - GC.Collect(); - } + performanceMonitor.OnPerformanceMetricsUpdatedHandler(trialSettings, metrics, trialCancellationTokenSource); }; performanceMonitor.Start(); - logger.Trace($"trial setting - {JsonSerializer.Serialize(setting)}"); - var trialResult = await runner.RunAsync(setting, trialCancellationTokenSource.Token); + logger.Trace($"trial setting - {JsonSerializer.Serialize(trialSettings)}"); + var trialResult = await runner.RunAsync(trialSettings, trialCancellationTokenSource.Token); var peakCpu = performanceMonitor?.GetPeakCpuUsage(); var peakMemoryInMB = performanceMonitor?.GetPeakMemoryUsageInMegaByte(); trialResult.PeakCpu = peakCpu; trialResult.PeakMemoryInMegaByte = peakMemoryInMB; + trialResult.TrialSettings.EndedAtUtc = DateTime.UtcNow; + performanceMonitor.Pause(); monitor?.ReportCompletedTrial(trialResult); tuner.Update(trialResult); trialResultManager?.AddOrUpdateTrialResult(trialResult); @@ -313,26 +292,37 @@ void handler(object o, EventArgs e) } catch (OperationCanceledException ex) when (aggregateTrainingStopManager.IsStopTrainingRequested() == false) { - monitor?.ReportFailTrial(setting, ex); - var result = new TrialResult + logger.Trace($"trial cancelled - {JsonSerializer.Serialize(trialSettings)}, continue training"); + trialSettings.EndedAtUtc = DateTime.UtcNow; + monitor?.ReportFailTrial(trialSettings, ex); + var trialResult = new TrialResult { - TrialSettings = setting, + TrialSettings = trialSettings, Loss = double.MaxValue, }; - tuner.Update(result); + tuner.Update(trialResult); + trialResultManager?.AddOrUpdateTrialResult(trialResult); + aggregateTrainingStopManager.Update(trialResult); continue; } catch (OperationCanceledException) when (aggregateTrainingStopManager.IsStopTrainingRequested()) { + logger.Trace($"trial cancelled - {JsonSerializer.Serialize(trialSettings)}, stop training"); + break; } catch (Exception ex) { - monitor?.ReportFailTrial(setting, ex); + logger.Trace($"trial failed - {JsonSerializer.Serialize(trialSettings)}, stop training"); + + trialSettings.EndedAtUtc = DateTime.UtcNow; + monitor?.ReportFailTrial(trialSettings, ex); if (!aggregateTrainingStopManager.IsStopTrainingRequested() && _bestTrialResult == null) { + logger.Trace($"trial fatal error - {JsonSerializer.Serialize(trialSettings)}, stop training"); + // TODO // it's questionable on whether to abort the entire training process // for a single fail trial. We should make it an option and only exit @@ -343,7 +333,6 @@ void handler(object o, EventArgs e) finally { aggregateTrainingStopManager.OnStopTraining -= handler; - } } } diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs index 6a28d80010..51335f8d94 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs @@ -19,7 +19,7 @@ public interface IMonitor void ReportFailTrial(TrialSettings settings, Exception exception = null); - void ReportRunningTrial(TrialSettings setting); + void ReportRunningTrial(TrialSettings settings); } /// @@ -30,6 +30,7 @@ internal class MLContextMonitor : IMonitor private readonly IChannel _logger; private readonly List _completedTrials; private readonly SweepablePipeline _pipeline; + public MLContextMonitor(IChannel logger, SweepablePipeline pipeline) { _logger = logger; @@ -55,7 +56,7 @@ public virtual void ReportFailTrial(TrialSettings settings, Exception exception public virtual void ReportRunningTrial(TrialSettings setting) { - _logger.Info($"Update Running Trial - Id: {setting.TrialId} - Pipeline: {_pipeline.ToString(setting.Parameter)}"); + _logger.Info($"Update Running Trial - Id: {setting.TrialId}"); } } diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs index fa86bb5894..a3a0601d66 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs @@ -5,47 +5,57 @@ using System; using System.Collections.Generic; using System.Diagnostics; +using System.IO; +using System.Linq; using System.Text; +using System.Threading; using System.Threading.Tasks; using System.Timers; using Microsoft.ML.Runtime; +using Timer = System.Timers.Timer; namespace Microsoft.ML.AutoML { - internal interface IPerformanceMonitor : IDisposable + public interface IPerformanceMonitor : IDisposable { void Start(); + void Pause(); + void Stop(); double? GetPeakMemoryUsageInMegaByte(); double? GetPeakCpuUsage(); - public event EventHandler CpuUsage; + /// + /// The handler function every time get fired. + /// + void OnPerformanceMetricsUpdatedHandler(TrialSettings trialSettings, TrialPerformanceMetrics metrics, CancellationTokenSource trialCancellationTokenSource); + - public event EventHandler MemoryUsageInMegaByte; + public event EventHandler PerformanceMetricsUpdated; } - internal class DefaultPerformanceMonitor : IPerformanceMonitor + public class DefaultPerformanceMonitor : IPerformanceMonitor { private readonly IChannel _logger; + private readonly AutoMLExperiment.AutoMLExperimentSettings _settings; private Timer _timer; private double? _peakCpuUsage; private double? _peakMemoryUsage; private readonly int _checkIntervalInMilliseconds; private TimeSpan _totalCpuProcessorTime; - public DefaultPerformanceMonitor(IChannel logger, int checkIntervalInMilliseconds) + public DefaultPerformanceMonitor(AutoMLExperiment.AutoMLExperimentSettings settings, IChannel logger, int checkIntervalInMilliseconds) { + _settings = settings; _logger = logger; _checkIntervalInMilliseconds = checkIntervalInMilliseconds; } - public event EventHandler CpuUsage; - - public event EventHandler MemoryUsageInMegaByte; + public event EventHandler PerformanceMetricsUpdated; public void Dispose() @@ -71,9 +81,18 @@ public void Start() _totalCpuProcessorTime = Process.GetCurrentProcess().TotalProcessorTime; _timer.Elapsed += OnCheckCpuAndMemoryUsage; _timer.AutoReset = true; - _timer.Enabled = true; _logger?.Trace($"{typeof(DefaultPerformanceMonitor)} has been started"); } + + // trigger the PerformanceMetricsUpdated event and (re)start the timer + _timer.Enabled = false; + SampleCpuAndMemoryUsage(); + _timer.Enabled = true; + } + + public void Pause() + { + _timer.Enabled = false; } public void Stop() @@ -108,11 +127,33 @@ private void SampleCpuAndMemoryUsage() _peakCpuUsage = Math.Max(cpuUsageInTotal, _peakCpuUsage ?? 0); // calculate Memory Usage in MB - var memoryUsage = process.PrivateMemorySize64 * 1.0 / (1024 * 1024); + var memoryUsage = process.WorkingSet64 * 1.0 / (1024 * 1024); _peakMemoryUsage = Math.Max(memoryUsage, _peakMemoryUsage ?? 0); + + var metrics = new TrialPerformanceMetrics() + { + CpuUsage = cpuUsageInTotal, + MemoryUsage = memoryUsage, + PeakCpuUsage = _peakCpuUsage, + PeakMemoryUsage = _peakMemoryUsage + }; + _logger?.Trace($"current CPU: {cpuUsageInTotal}, current Memory(mb): {memoryUsage}"); - MemoryUsageInMegaByte?.Invoke(this, memoryUsage); - CpuUsage?.Invoke(this, cpuUsageInTotal); + + PerformanceMetricsUpdated?.Invoke(this, metrics); + } + } + + public virtual void OnPerformanceMetricsUpdatedHandler(TrialSettings trialSettings, TrialPerformanceMetrics metrics, CancellationTokenSource trialCancellationTokenSource) + { + _logger.Trace($"maximum memory usage: {_settings.MaximumMemoryUsageInMegaByte}, PeakMemoryUsage: {metrics.PeakMemoryUsage} trialIsCancelled: {trialCancellationTokenSource.IsCancellationRequested}"); + if (_settings.MaximumMemoryUsageInMegaByte is double d && metrics.PeakMemoryUsage > d && !trialCancellationTokenSource.IsCancellationRequested) + { + _logger.Trace($"cancel current trial {trialSettings.TrialId} because it uses {metrics.PeakMemoryUsage} mb memory and the maximum memory usage is {d}"); + trialCancellationTokenSource.Cancel(); + + GC.AddMemoryPressure(Convert.ToInt64(metrics.PeakMemoryUsage) * 1024 * 1024); + GC.Collect(); } } } diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/IStopTrainingManager.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/IStopTrainingManager.cs index e49ceef4ee..7f6b94641a 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/IStopTrainingManager.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/IStopTrainingManager.cs @@ -140,10 +140,21 @@ public void AddTrainingStopManager(IStopTrainingManager manager) _managers.Add(manager); manager.OnStopTraining += (o, e) => { - OnStopTraining?.Invoke(this, e); + if (_managers.Exists(manager.Equals)) + { + OnStopTraining?.Invoke(this, e); + } }; } + public void RemoveTrainingStopManagerIfExist(IStopTrainingManager manager) + { + if (_managers.Exists(manager.Equals)) + { + _managers.RemoveAll(manager.Equals); + } + } + public void Update(TrialResult result) { foreach (var manager in _managers) diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialPerformanceMetrics.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialPerformanceMetrics.cs new file mode 100644 index 0000000000..e35ca8d47a --- /dev/null +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialPerformanceMetrics.cs @@ -0,0 +1,33 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.ML.AutoML +{ + /// + /// Performance metrics for a trial. + /// + public class TrialPerformanceMetrics + { + /// + /// Peak memory usage during the trial in megabytes + /// + public double? PeakMemoryUsage { get; set; } + /// + /// Peak CPU usage during the trial + /// + public double? PeakCpuUsage { get; set; } + /// + /// Current CPU usage of the runner process + /// + public double CpuUsage { get; internal set; } + /// + /// Current memory usage of the runner process in megabytes + /// + public double MemoryUsage { get; internal set; } + } +} diff --git a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs index 38d14c2a6d..f86bb58bfe 100644 --- a/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs +++ b/src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs @@ -2,14 +2,34 @@ // The .NET Foundation licenses this file to you under the MIT license. // See the LICENSE file in the project root for more information. +using System; +using System.Runtime.Serialization; +using System.Text.Json.Serialization; +using System.Threading; using Microsoft.ML.SearchSpace; namespace Microsoft.ML.AutoML { + /// + /// Settings used for the trial + /// public class TrialSettings { + /// + /// Identifier of the trial + /// public int TrialId { get; set; } - + /// + /// UTC time when the trial started + /// + public DateTime StartedAtUtc { get; set; } + /// + /// UTC time when the trial ended, null if it's still running + /// + public DateTime? EndedAtUtc { get; set; } + /// + /// Parameters for the pipeline used in this trial + /// public Parameter Parameter { get; set; } } } diff --git a/src/Microsoft.ML.SearchSpace/Parameter.cs b/src/Microsoft.ML.SearchSpace/Parameter.cs index a2fb0406dd..58eeff0b6d 100644 --- a/src/Microsoft.ML.SearchSpace/Parameter.cs +++ b/src/Microsoft.ML.SearchSpace/Parameter.cs @@ -50,7 +50,7 @@ public enum ParameterType } /// - /// is used to save sweeping result from tuner and is used to restore mlnet pipeline from sweepable pipline. + /// is used to save sweeping result from tuner and is used to restore mlnet pipeline from sweepable pipeline. /// [JsonConverter(typeof(ParameterConverter))] public sealed class Parameter : IDictionary, IEquatable, IEqualityComparer diff --git a/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs b/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs index 8986b92dfb..432895c441 100644 --- a/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/AutoFitTests.cs @@ -131,7 +131,7 @@ public void AutoFit_Taxi_Fare_Train_Test_Split_Test() var label = "fare_amount"; var settings = new RegressionExperimentSettings { - MaxExperimentTimeInSeconds = 1, + MaxModels = 1, }; settings.Trainers.Remove(RegressionTrainer.LightGbm); settings.Trainers.Remove(RegressionTrainer.StochasticDualCoordinateAscent); @@ -161,7 +161,7 @@ public void AutoFit_Taxi_Fare_CrossValidation_10_Test() var label = "fare_amount"; var settings = new RegressionExperimentSettings { - MaxExperimentTimeInSeconds = 1, + MaxModels = 1, }; settings.Trainers.Remove(RegressionTrainer.LightGbm); settings.Trainers.Remove(RegressionTrainer.StochasticDualCoordinateAscent); @@ -191,7 +191,7 @@ public void AutoFit_Taxi_Fare_Test() var label = "fare_amount"; var settings = new RegressionExperimentSettings { - MaxExperimentTimeInSeconds = 1, + MaxModels = 1, }; settings.Trainers.Remove(RegressionTrainer.LightGbm); settings.Trainers.Remove(RegressionTrainer.StochasticDualCoordinateAscent); diff --git a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs index 18a0245718..8e073394c6 100644 --- a/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs +++ b/test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs @@ -73,7 +73,7 @@ public async Task AutoMLExperiment_cancel_trial_when_exceeds_memory_limit_Async( // the following experiment set memory usage limit to 0.01mb // so all trials should be canceled and there should be no successful trials. // therefore when experiment finishes, it should throw timeout exception with no model trained message. - experiment.SetTrainingTimeInSeconds(10) + experiment.SetMaxModelToExplore(10) .SetTrialRunner((serviceProvider) => { var channel = serviceProvider.GetService(); @@ -81,8 +81,7 @@ public async Task AutoMLExperiment_cancel_trial_when_exceeds_memory_limit_Async( return new DummyTrialRunner(settings, 5, channel); }) .SetTuner() - .SetMaximumMemoryUsageInMegaByte(0.01) - .SetPerformanceMonitor(); + .SetMaximumMemoryUsageInMegaByte(0.01); var runExperimentAction = async () => await experiment.RunAsync(); await runExperimentAction.Should().ThrowExactlyAsync(); @@ -366,13 +365,11 @@ public void AutoMLExperiment_should_use_seed_from_context_if_provided() class DummyTrialRunner : ITrialRunner { private readonly int _finishAfterNSeconds; - private readonly CancellationToken _ct; private readonly IChannel _logger; public DummyTrialRunner(AutoMLExperiment.AutoMLExperimentSettings automlSettings, int finishAfterNSeconds, IChannel logger) { _finishAfterNSeconds = finishAfterNSeconds; - _ct = automlSettings.CancellationToken; _logger = logger; } @@ -384,7 +381,7 @@ public async Task RunAsync(TrialSettings settings, CancellationToke { _logger.Info("Update Running Trial"); await Task.Delay(_finishAfterNSeconds * 1000, ct); - _ct.ThrowIfCancellationRequested(); + ct.ThrowIfCancellationRequested(); _logger.Info("Update Completed Trial"); var metric = 1.000 + 0.01 * settings.TrialId; return new TrialResult @@ -407,9 +404,7 @@ public DummyPeformanceMonitor() _checkIntervalInMilliseconds = 1000; } - public event EventHandler CpuUsage; - - public event EventHandler MemoryUsageInMegaByte; + public event EventHandler PerformanceMetricsUpdated; public void Dispose() { @@ -425,6 +420,10 @@ public void Dispose() return 1000; } + public void OnPerformanceMetricsUpdatedHandler(TrialSettings trialSettings, TrialPerformanceMetrics metrics, CancellationTokenSource trialCancellationTokenSource) + { + } + public void Start() { if (_timer == null) @@ -432,13 +431,17 @@ public void Start() _timer = new System.Timers.Timer(_checkIntervalInMilliseconds); _timer.Elapsed += (o, e) => { - CpuUsage?.Invoke(this, 100); - MemoryUsageInMegaByte?.Invoke(this, 1000); + PerformanceMetricsUpdated?.Invoke(this, new TrialPerformanceMetrics() { PeakCpuUsage = 100, PeakMemoryUsage = 1000 }); }; _timer.AutoReset = true; - _timer.Enabled = true; } + _timer.Enabled = true; + } + + public void Pause() + { + _timer.Enabled = false; } public void Stop()