Skip to content

Commit d794383

Browse files
authored
Adding Debugging Scenario tests for V1 APIs (#2937)
* Adding Debugging Scenario tests for V1 APIs
1 parent 9f87099 commit d794383

File tree

2 files changed

+214
-39
lines changed

2 files changed

+214
-39
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
// Licensed to the .NET Foundation under one or more agreements.
2+
// The .NET Foundation licenses this file to you under the MIT license.
3+
// See the LICENSE file in the project root for more information.
4+
5+
using System.Collections.Generic;
6+
using Microsoft.ML.Data;
7+
using Microsoft.ML.Functional.Tests.Datasets;
8+
using Microsoft.ML.RunTests;
9+
using Microsoft.ML.TestFramework;
10+
using Microsoft.ML.Trainers;
11+
using Microsoft.ML.Transforms.Text;
12+
using Xunit;
13+
using Xunit.Abstractions;
14+
15+
namespace Microsoft.ML.Functional.Tests
{
    /// <summary>
    /// Functional tests covering "debugging" scenarios for the V1 APIs: inspecting
    /// intermediate pipeline data, pipeline output schemas, loaded data, and
    /// training log output.
    /// </summary>
    public class Debugging : BaseTestClass
    {
        public Debugging(ITestOutputHelper output) : base(output)
        {
        }

        /// <summary>
        /// Debugging: The individual pipeline steps can be inspected to see what is happening to
        /// data as it flows through.
        /// </summary>
        /// <remarks>
        /// It should, possibly through the debugger, be not such a pain to actually
        /// see what is happening to your data when you apply this or that transform. For example, if I
        /// were to have the text "Help I'm a bug!" I should be able to see the steps where it is
        /// normalized to "help i'm a bug" then tokenized into ["help", "i'm", "a", "bug"] then
        /// mapped into term numbers [203, 25, 3, 511] then projected into the sparse
        /// float vector {3:1, 25:1, 203:1, 511:1}, etc. etc.
        /// </remarks>
        [Fact]
        public void InspectIntermediatePipelineSteps()
        {
            var mlContext = new MLContext(seed: 1);

            // A tiny in-memory dataset keeps the expected tokens/features easy to state exactly.
            var data = mlContext.Data.LoadFromEnumerable<TweetSentiment>(
                new TweetSentiment[]
                {
                    new TweetSentiment { Sentiment = true, SentimentText = "I love ML.NET." },
                    new TweetSentiment { Sentiment = true, SentimentText = "I love TLC." },
                    new TweetSentiment { Sentiment = false, SentimentText = "I dislike fika." }
                });

            // Create a training pipeline. OutputTokens = true surfaces the intermediate
            // tokenization as a "Features_TransformedText" column so it can be inspected below.
            var pipeline = mlContext.Transforms.Text.FeaturizeText(
                "Features",
                new TextFeaturizingEstimator.Options
                {
                    KeepPunctuations = false,
                    OutputTokens = true,
                    CharFeatureExtractor = null, // new WordBagEstimator.Options { NgramLength = 0, SkipLength = -1 },
                    WordFeatureExtractor = new WordBagEstimator.Options { NgramLength = 1 },
                    Norm = TextFeaturizingEstimator.NormFunction.None
                },
                "SentimentText");

            // Fit the pipeline to the data.
            var model = pipeline.Fit(data);

            // Transform the data.
            var transformedData = model.Transform(data);

            // Preview materializes a snapshot of the transformed data; this is the
            // debugger-friendly entry point the scenario is exercising.
            var preview = transformedData.Preview();

            // Verify that columns can be inspected.
            // Validate the tokens column.
            var tokensColumn = transformedData.GetColumn<string[]>(transformedData.Schema["Features_TransformedText"]);
            var expectedTokens = new string[3][]
            {
                new string[] {"i", "love", "mlnet"},
                new string[] {"i", "love", "tlc"},
                new string[] {"i", "dislike", "fika"},
            };
            int i = 0;
            foreach (var rowTokens in tokensColumn)
                Assert.Equal(expectedTokens[i++], rowTokens);

            // Validate the Features column: one-hot word-bag counts over the
            // six-term vocabulary, unnormalized (NormFunction.None above).
            var featuresColumn = transformedData.GetColumn<float[]>(transformedData.Schema["Features"]);
            var expectedFeatures = new float[3][]
            {
                new float[6] { 1, 1, 1, 0, 0, 0 },
                new float[6] { 1, 1, 0, 1, 0, 0 },
                new float[6] { 1, 0, 0, 0, 1, 1 }
            };
            i = 0;
            foreach (var rowFeatures in featuresColumn)
                Assert.Equal(expectedFeatures[i++], rowFeatures);
        }

        /// <summary>
        /// Debugging: The schema of the pipeline can be inspected.
        /// </summary>
        [Fact]
        public void InspectPipelineSchema()
        {
            var mlContext = new MLContext(seed: 1);

            // Get the dataset.
            var data = mlContext.Data.LoadFromTextFile<HousingRegression>(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true);

            // Define a pipeline: featurize, normalize, cache, then train an SDCA regressor.
            var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features)
                .Append(mlContext.Transforms.Normalize())
                .AppendCacheCheckpoint(mlContext)
                .Append(mlContext.Regression.Trainers.Sdca(
                    new SdcaRegressionTrainer.Options { NumberOfThreads = 1, MaximumNumberOfIterations = 20 }));

            // Fit the pipeline to the data.
            var model = pipeline.Fit(data);

            // Inspect the model schema, and verify that a Score column is produced.
            var outputSchema = model.GetOutputSchema(data.Schema);
            var columnNames = new string[outputSchema.Count];
            int i = 0;
            foreach (var column in outputSchema)
                columnNames[i++] = column.Name;
            Assert.Contains("Score", columnNames);
        }

        /// <summary>
        /// Debugging: The schema read in can be verified by inspecting the data.
        /// </summary>
        [Fact]
        public void InspectSchemaUponLoadingData()
        {
            var mlContext = new MLContext(seed: 1);

            // Get the dataset.
            var data = mlContext.Data.LoadFromTextFile<HousingRegression>(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true);

            // Verify the column names: "Label" first, then the feature columns in order.
            int i = 0;
            foreach (var column in data.Schema)
            {
                if (i == 0)
                    Assert.Equal("Label", column.Name);
                else
                    Assert.Equal(HousingRegression.Features[i - 1], column.Name);
                i++;
            }

            // Verify that I can cast it to the right schema by inspecting the first row.
            foreach (var row in mlContext.Data.CreateEnumerable<HousingRegression>(mlContext.Data.TakeRows(data, 1), true))
            {
                // Validate there was data in the row by checking that some values were not zero since zero is the default.
                var rowSum = row.MedianHomeValue;
                foreach (var property in HousingRegression.Features)
                    rowSum += (float)row.GetType().GetProperty(property).GetValue(row, null);

                Assert.NotEqual(0, rowSum);
            }
        }

        /// <summary>
        /// Debugging: The progress of training can be accessed.
        /// </summary>
        [Fact]
        public void ViewTrainingOutput()
        {
            var mlContext = new MLContext(seed: 1);

            // Attach a listener so every log message emitted during training is recorded.
            var logWatcher = new LogWatcher();
            mlContext.Log += logWatcher.ObserveEvent;

            // Get the dataset.
            var data = mlContext.Data.LoadFromTextFile<HousingRegression>(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true);

            // Define a pipeline
            var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features)
                .Append(mlContext.Transforms.Normalize())
                .AppendCacheCheckpoint(mlContext)
                .Append(mlContext.Regression.Trainers.Sdca(
                    new SdcaRegressionTrainer.Options { NumberOfThreads = 1, MaximumNumberOfIterations = 20 }));

            // Fit the pipeline to the data.
            var model = pipeline.Fit(data);

            // Validate that we can read lines from the file. With a fixed seed and a
            // single thread, these exact SDCA messages are expected exactly once each.
            var expectedLines = new string[3] {
                @"[Source=SdcaTrainerBase; Training, Kind=Info] Auto-tuning parameters: L2 = 0.001.",
                @"[Source=SdcaTrainerBase; Training, Kind=Info] Auto-tuning parameters: L1Threshold (L1/L2) = 0.",
                @"[Source=SdcaTrainerBase; Training, Kind=Info] Using best model from iteration 7."};
            foreach (var line in expectedLines)
            {
                Assert.Contains(line, logWatcher.Lines);
                Assert.Equal(1, logWatcher.Lines[line]);
            }
        }

        /// <summary>
        /// Collects messages raised through <see cref="MLContext.Log"/> and counts
        /// how many times each distinct message text was observed.
        /// </summary>
        internal class LogWatcher
        {
            // Map from log message text to the number of times it was seen.
            public readonly IDictionary<string, int> Lines;

            public LogWatcher()
            {
                Lines = new Dictionary<string, int>();
            }

            public void ObserveEvent(object sender, LoggingEventArgs e)
            {
                // Single lookup via TryGetValue instead of ContainsKey + indexer.
                Lines[e.Message] = Lines.TryGetValue(e.Message, out var count) ? count + 1 : 1;
            }
        }
    }
}

test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Visibility.cs

-39
This file was deleted.

0 commit comments

Comments (0)