Skip to content

Add continuous resource monitoring to AutoML.IMonitor #6520

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
9647027
Fix a typo
andrasfuchs Nov 25, 2022
269b1bd
Fix trial cancellation bug
andrasfuchs Nov 25, 2022
a2c5781
Move performance related properties to TrialPerformanceMetrics and ad…
andrasfuchs Nov 25, 2022
e3fd992
Add new class and property explanations
andrasfuchs Nov 25, 2022
88fdefa
Revert "Fix trial cancellation bug"
andrasfuchs Dec 2, 2022
b03e46a
Remove pipeline info from the IMonitor Running event
andrasfuchs Dec 6, 2022
bf69dd2
Remove FreeSpaceOnDrives from TrialPerformanceMetrics
andrasfuchs Dec 6, 2022
38cf838
Change the default resource check interval to 5 seconds
andrasfuchs Dec 6, 2022
7f40df5
Remove StartedAtUtc property from TrialSettings
andrasfuchs Dec 22, 2022
8aa0ad8
move ReportTrialResourceUsage to IPerformanceMonitor
LittleLittleCloud Jan 3, 2023
739d865
Update AutoMLExperimentExtension.cs
LittleLittleCloud Jan 3, 2023
aeb651c
Merge pull request #2 from LittleLittleCloud/u/xiaoyun/add-cancellati…
andrasfuchs Jan 5, 2023
fc82c4c
Pause the performance monitor if the trial is not running
andrasfuchs Jan 6, 2023
d0ce0cd
Add StartedAtUtc and EndedAtUtc to TrialSettings
andrasfuchs Jan 8, 2023
4149a4b
cancel trial when as is
LittleLittleCloud Feb 6, 2023
7d3257a
fix tests
LittleLittleCloud Feb 6, 2023
c5c2d83
Merge branch 'main' into add-cancellation-and-resource-monitoring-to-…
LittleLittleCloud Feb 7, 2023
3919324
fix tests
LittleLittleCloud Feb 7, 2023
13ba949
fix tests
LittleLittleCloud Feb 7, 2023
488ff20
use workingset to evaluate memory usage
LittleLittleCloud Feb 8, 2023
49ac8ae
remove handler
LittleLittleCloud Feb 8, 2023
3722dcb
add handler back
LittleLittleCloud Feb 8, 2023
ff55857
add more logging
LittleLittleCloud Feb 8, 2023
509f963
add more logger
LittleLittleCloud Feb 8, 2023
1240335
add logging
LittleLittleCloud Feb 9, 2023
5a27af4
fix tests
LittleLittleCloud Feb 9, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/Microsoft.ML.AutoML.Interactive/NotebookMonitor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ public class NotebookMonitor : IMonitor
public List<TrialResult> CompletedTrials { get; set; }
public DataFrame TrialData { get; set; }

/// <summary>
/// Interval in milliseconds to report resource usage. This monitor always returns 5000.
/// </summary>
public int ResourceUsageCheckInterval => 5000;

public NotebookMonitor(SweepablePipeline pipeline)
{
CompletedTrials = new List<TrialResult>();
Expand Down Expand Up @@ -84,5 +86,9 @@ public void SetUpdate(DisplayedValue valueToUpdate)
_valueToUpdate = valueToUpdate;
ThrottledUpdate();
}

/// <summary>
/// Resource-usage callback for a trial. This monitor does nothing with the report.
/// </summary>
/// <param name="setting">Settings of the trial the report refers to; not read by this implementation.</param>
public void ReportTrialResourceUsage(TrialSettings setting)
{
// No-op: the body is empty, so resource-usage reports are ignored by this monitor.
}
}
}
51 changes: 37 additions & 14 deletions src/Microsoft.ML.AutoML/AutoMLExperiment/AutoMLExperiment.cs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
// See the LICENSE file in the project root for more information.

using System;
using System.Diagnostics;
using System.Linq;
using System.Text.Json;
using System.Threading;
Expand Down Expand Up @@ -249,19 +250,37 @@ public async Task<TrialResult> RunAsync(CancellationToken ct = default)
var trialNum = trialResultManager?.GetAllTrialResults().Max(t => t.TrialSettings?.TrialId) + 1 ?? 0;
var tuner = serviceProvider.GetService<ITuner>();
Contracts.Assert(tuner != null, "tuner can't be null");

while (!aggregateTrainingStopManager.IsStopTrainingRequested())
{
var setting = new TrialSettings()
var trialSettings = new TrialSettings()
{
TrialId = trialNum++,
Parameter = Parameter.CreateNestedParameter(),
StartedAtUtc = DateTime.UtcNow,
CancellationTokenSource = null,
PerformanceMetrics = new TrialPerformanceMetrics(),
};
var parameter = tuner.Propose(setting);
setting.Parameter = parameter;
var parameter = tuner.Propose(trialSettings);
trialSettings.Parameter = parameter;

monitor?.ReportRunningTrial(setting);
using (var trialCancellationTokenSource = new CancellationTokenSource())
{
trialSettings.CancellationTokenSource = trialCancellationTokenSource;
monitor?.ReportRunningTrial(trialSettings);

System.Timers.Timer resourceUsageTimer = null;
if ((monitor != null) && (monitor?.ResourceUsageCheckInterval > 0))
{
resourceUsageTimer = new System.Timers.Timer(monitor.ResourceUsageCheckInterval);
resourceUsageTimer.Elapsed += (o, e) =>
{
monitor?.ReportTrialResourceUsage(trialSettings);
};
resourceUsageTimer.AutoReset = true;
resourceUsageTimer.Enabled = false;
}

void handler(object o, EventArgs e)
{
// Only force-cancel running trials when there are completed trials.
Expand All @@ -276,21 +295,25 @@ void handler(object o, EventArgs e)
{
aggregateTrainingStopManager.OnStopTraining += handler;

performanceMonitor.MemoryUsageInMegaByte += (o, m) =>
performanceMonitor.PerformanceMetricsUpdated += (o, metrics) =>
{
if (_settings.MaximumMemoryUsageInMegaByte is double d && m > d && !trialCancellationTokenSource.IsCancellationRequested)
trialSettings.PerformanceMetrics = metrics;

if (_settings.MaximumMemoryUsageInMegaByte is double d && metrics.PeakMemoryUsage > d && !trialCancellationTokenSource.IsCancellationRequested)
{
logger.Trace($"cancel current trial {setting.TrialId} because it uses {m} mb memory and the maximum memory usage is {d}");
logger.Trace($"cancel current trial {trialSettings.TrialId} because it uses {metrics.PeakMemoryUsage} mb memory and the maximum memory usage is {d}");
trialCancellationTokenSource.Cancel();

GC.AddMemoryPressure(Convert.ToInt64(m) * 1024 * 1024);
GC.AddMemoryPressure(Convert.ToInt64(metrics.PeakMemoryUsage) * 1024 * 1024);
GC.Collect();
}
};

var trialTask = runner.RunAsync(trialSettings, trialCancellationTokenSource.Token);
performanceMonitor.Start();
logger.Trace($"trial setting - {JsonSerializer.Serialize(setting)}");
var trialResult = await runner.RunAsync(setting, trialCancellationTokenSource.Token);
resourceUsageTimer?.Start();
logger.Trace($"trial setting - {JsonSerializer.Serialize(trialSettings)}");
var trialResult = await trialTask;

var peakCpu = performanceMonitor?.GetPeakCpuUsage();
var peakMemoryInMB = performanceMonitor?.GetPeakMemoryUsageInMegaByte();
Expand All @@ -313,10 +336,10 @@ void handler(object o, EventArgs e)
}
catch (OperationCanceledException ex) when (aggregateTrainingStopManager.IsStopTrainingRequested() == false)
{
monitor?.ReportFailTrial(setting, ex);
monitor?.ReportFailTrial(trialSettings, ex);
var result = new TrialResult
{
TrialSettings = setting,
TrialSettings = trialSettings,
Loss = double.MaxValue,
};

Expand All @@ -329,7 +352,7 @@ void handler(object o, EventArgs e)
}
catch (Exception ex)
{
monitor?.ReportFailTrial(setting, ex);
monitor?.ReportFailTrial(trialSettings, ex);

if (!aggregateTrainingStopManager.IsStopTrainingRequested() && _bestTrialResult == null)
{
Expand All @@ -343,7 +366,7 @@ void handler(object o, EventArgs e)
finally
{
aggregateTrainingStopManager.OnStopTraining -= handler;

resourceUsageTimer?.Stop();
}
}
}
Expand Down
21 changes: 15 additions & 6 deletions src/Microsoft.ML.AutoML/AutoMLExperiment/IMonitor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,15 @@ namespace Microsoft.ML.AutoML
/// </summary>
public interface IMonitor
{
/// <summary>
/// Interval in milliseconds to report resource usage.
/// </summary>
int ResourceUsageCheckInterval { get; }
void ReportCompletedTrial(TrialResult result);

void ReportBestTrial(TrialResult result);

void ReportFailTrial(TrialSettings settings, Exception exception = null);

void ReportRunningTrial(TrialSettings setting);
void ReportRunningTrial(TrialSettings settings);
void ReportTrialResourceUsage(TrialSettings settings);
}

/// <summary>
Expand All @@ -30,11 +32,14 @@ internal class MLContextMonitor : IMonitor
private readonly IChannel _logger;
private readonly List<TrialResult> _completedTrials;
private readonly SweepablePipeline _pipeline;
public MLContextMonitor(IChannel logger, SweepablePipeline pipeline)
public int ResourceUsageCheckInterval { get; private set; }

public MLContextMonitor(IChannel logger, SweepablePipeline pipeline, int resourceUsageCheckInterval = 5000)
{
_logger = logger;
_completedTrials = new List<TrialResult>();
_pipeline = pipeline;
ResourceUsageCheckInterval = resourceUsageCheckInterval;
}

public virtual void ReportBestTrial(TrialResult result)
Expand All @@ -55,7 +60,11 @@ public virtual void ReportFailTrial(TrialSettings settings, Exception exception

public virtual void ReportRunningTrial(TrialSettings setting)
{
_logger.Info($"Update Running Trial - Id: {setting.TrialId} - Pipeline: {_pipeline.ToString(setting.Parameter)}");
_logger.Info($"Update Running Trial - Id: {setting.TrialId}");
}

/// <summary>
/// Resource-usage callback for a trial. This monitor does nothing with the report.
/// </summary>
/// <param name="setting">Settings of the trial the report refers to; not read by this implementation.</param>
public void ReportTrialResourceUsage(TrialSettings setting)
{
// No-op: the body is empty, so nothing is logged for resource-usage reports.
}
}

Expand Down
23 changes: 15 additions & 8 deletions src/Microsoft.ML.AutoML/AutoMLExperiment/IPerformanceMonitor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Timers;
Expand All @@ -22,9 +24,7 @@ internal interface IPerformanceMonitor : IDisposable

double? GetPeakCpuUsage();

public event EventHandler<double> CpuUsage;

public event EventHandler<double> MemoryUsageInMegaByte;
public event EventHandler<TrialPerformanceMetrics> PerformanceMetricsUpdated;
}

internal class DefaultPerformanceMonitor : IPerformanceMonitor
Expand All @@ -43,9 +43,7 @@ public DefaultPerformanceMonitor(IChannel logger, int checkIntervalInMillisecond
}


public event EventHandler<double> CpuUsage;

public event EventHandler<double> MemoryUsageInMegaByte;
public event EventHandler<TrialPerformanceMetrics> PerformanceMetricsUpdated;


public void Dispose()
Expand Down Expand Up @@ -110,9 +108,18 @@ private void SampleCpuAndMemoryUsage()
// calculate Memory Usage in MB
var memoryUsage = process.PrivateMemorySize64 * 1.0 / (1024 * 1024);
_peakMemoryUsage = Math.Max(memoryUsage, _peakMemoryUsage ?? 0);

var metrics = new TrialPerformanceMetrics()
{
CpuUsage = cpuUsageInTotal,
MemoryUsage = memoryUsage,
PeakCpuUsage = _peakCpuUsage,
PeakMemoryUsage = _peakMemoryUsage
};

_logger?.Trace($"current CPU: {cpuUsageInTotal}, current Memory(mb): {memoryUsage}");
MemoryUsageInMegaByte?.Invoke(this, memoryUsage);
CpuUsage?.Invoke(this, cpuUsageInTotal);

PerformanceMetricsUpdated?.Invoke(this, metrics);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Collections.Generic;
using System.Text;

namespace Microsoft.ML.AutoML
{
/// <summary>
/// Performance metrics for a trial.
/// </summary>
/// <remarks>
/// Holds two kinds of values: the running peaks (<see cref="PeakMemoryUsage"/>, <see cref="PeakCpuUsage"/>)
/// and the current readings (<see cref="MemoryUsage"/>, <see cref="CpuUsage"/>).
/// The peak properties are nullable and are <c>null</c> on a newly constructed instance;
/// the current readings are non-nullable and can only be set from within this assembly.
/// </remarks>
public class TrialPerformanceMetrics
{
/// <summary>
/// Peak memory usage during the trial, in megabytes. <c>null</c> until a value has been assigned.
/// </summary>
public double? PeakMemoryUsage { get; set; }
/// <summary>
/// Peak CPU usage during the trial. <c>null</c> until a value has been assigned.
/// </summary>
/// <remarks>
/// NOTE(review): the unit/scale of this value (fraction vs. percent) is not stated here — confirm against the performance monitor that produces it.
/// </remarks>
public double? PeakCpuUsage { get; set; }
/// <summary>
/// Current CPU usage of the runner process. Settable only from within this assembly.
/// </summary>
/// <remarks>
/// NOTE(review): the unit/scale of this value (fraction vs. percent) is not stated here — confirm against the performance monitor that produces it.
/// </remarks>
public double CpuUsage { get; internal set; }
/// <summary>
/// Current memory usage of the runner process, in megabytes. Settable only from within this assembly.
/// </summary>
public double MemoryUsage { get; internal set; }
}
}
27 changes: 26 additions & 1 deletion src/Microsoft.ML.AutoML/AutoMLExperiment/TrialSettings.cs
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,39 @@
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Runtime.Serialization;
using System.Text.Json.Serialization;
using System.Threading;
using Microsoft.ML.SearchSpace;

namespace Microsoft.ML.AutoML
{
/// <summary>
/// Settings used for the trial.
/// </summary>
/// <remarks>
/// Besides the trial's identity and parameters, this type also carries per-trial runtime state:
/// the cancellation token source, the performance metrics and the start time.
/// </remarks>
public class TrialSettings
{
/// <summary>
/// Identifier of the trial.
/// </summary>
public int TrialId { get; set; }

/// <summary>
/// Parameters for the pipeline used in this trial.
/// </summary>
public Parameter Parameter { get; set; }
/// <summary>
/// Cancellation token source that gives holders of these settings the ability to cancel the trial.
/// </summary>
/// <remarks>
/// Marked with <c>[JsonIgnore]</c>, so it is left out when the settings are serialized to JSON.
/// </remarks>
[JsonIgnore]
public CancellationTokenSource CancellationTokenSource { get; set; }
/// <summary>
/// Performance metrics of the trial. Settable only from within this assembly.
/// </summary>
public TrialPerformanceMetrics PerformanceMetrics { get; internal set; }
/// <summary>
/// The time when the trial started (UTC). Settable only from within this assembly.
/// </summary>
public DateTime StartedAtUtc { get; internal set; }
}
}
2 changes: 1 addition & 1 deletion src/Microsoft.ML.SearchSpace/Parameter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ public enum ParameterType
}

/// <summary>
/// <see cref="Parameter"/> is used to save sweeping result from tuner and is used to restore mlnet pipeline from sweepable pipline.
/// <see cref="Parameter"/> is used to save sweeping result from tuner and is used to restore mlnet pipeline from sweepable pipeline.
/// </summary>
[JsonConverter(typeof(ParameterConverter))]
public sealed class Parameter : IDictionary<string, Parameter>, IEquatable<Parameter>, IEqualityComparer<Parameter>
Expand Down
7 changes: 2 additions & 5 deletions test/Microsoft.ML.AutoML.Tests/AutoMLExperimentTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -407,9 +407,7 @@ public DummyPeformanceMonitor()
_checkIntervalInMilliseconds = 1000;
}

public event EventHandler<double> CpuUsage;

public event EventHandler<double> MemoryUsageInMegaByte;
public event EventHandler<TrialPerformanceMetrics> PerformanceMetricsUpdated;

public void Dispose()
{
Expand All @@ -432,8 +430,7 @@ public void Start()
_timer = new System.Timers.Timer(_checkIntervalInMilliseconds);
_timer.Elapsed += (o, e) =>
{
CpuUsage?.Invoke(this, 100);
MemoryUsageInMegaByte?.Invoke(this, 1000);
PerformanceMetricsUpdated?.Invoke(this, new TrialPerformanceMetrics() { PeakCpuUsage = 100, PeakMemoryUsage = 1000 });
};

_timer.AutoReset = true;
Expand Down