|
| 1 | +// Licensed to the .NET Foundation under one or more agreements. |
| 2 | +// The .NET Foundation licenses this file to you under the MIT license. |
| 3 | +// See the LICENSE file in the project root for more information. |
| 4 | + |
| 5 | +using System.Collections.Generic; |
| 6 | +using Microsoft.ML.Data; |
| 7 | +using Microsoft.ML.Functional.Tests.Datasets; |
| 8 | +using Microsoft.ML.RunTests; |
| 9 | +using Microsoft.ML.TestFramework; |
| 10 | +using Microsoft.ML.Trainers; |
| 11 | +using Microsoft.ML.Transforms.Text; |
| 12 | +using Xunit; |
| 13 | +using Xunit.Abstractions; |
| 14 | + |
namespace Microsoft.ML.Functional.Tests
{
    /// <summary>
    /// Functional tests for the "debugging" scenarios: inspecting intermediate pipeline
    /// output, inspecting schemas, and observing the log messages raised during training.
    /// </summary>
    public class Debugging : BaseTestClass
    {
        public Debugging(ITestOutputHelper output) : base(output)
        {
        }

        /// <summary>
        /// Debugging: The individual pipeline steps can be inspected to see what is happening to
        /// data as it flows through.
        /// </summary>
        /// <remarks>
        /// It should, possibly through the debugger, be not such a pain to actually
        /// see what is happening to your data when you apply this or that transform. For example, if I
        /// were to have the text "Help I'm a bug!" I should be able to see the steps where it is
        /// normalized to "help i'm a bug" then tokenized into ["help", "i'm", "a", "bug"] then
        /// mapped into term numbers [203, 25, 3, 511] then projected into the sparse
        /// float vector {3:1, 25:1, 203:1, 511:1}, etc. etc.
        /// </remarks>
        [Fact]
        public void InspectIntermediatePipelineSteps()
        {
            var mlContext = new MLContext(seed: 1);

            var data = mlContext.Data.LoadFromEnumerable<TweetSentiment>(
                new TweetSentiment[]
                {
                    new TweetSentiment { Sentiment = true, SentimentText = "I love ML.NET." },
                    new TweetSentiment { Sentiment = true, SentimentText = "I love TLC." },
                    new TweetSentiment { Sentiment = false, SentimentText = "I dislike fika." }
                });

            // Create a featurization pipeline that also emits the intermediate tokens column.
            var pipeline = mlContext.Transforms.Text.FeaturizeText(
                "Features",
                new TextFeaturizingEstimator.Options
                {
                    KeepPunctuations = false,
                    OutputTokens = true,
                    // NOTE(review): null presumably switches off the character-level extractor;
                    // the 6-slot vectors expected below correspond to the 6 distinct words only.
                    CharFeatureExtractor = null,
                    WordFeatureExtractor = new WordBagEstimator.Options { NgramLength = 1 },
                    Norm = TextFeaturizingEstimator.NormFunction.None
                },
                "SentimentText");

            // Fit the pipeline to the data.
            var model = pipeline.Fit(data);

            // Transform the data.
            var transformedData = model.Transform(data);

            // A preview of the transformed data can be materialized for inspection.
            var preview = transformedData.Preview();
            Assert.NotNull(preview);

            // Verify that columns can be inspected.
            // Validate the tokens column.
            var tokensColumn = transformedData.GetColumn<string[]>(transformedData.Schema["Features_TransformedText"]);
            var expectedTokens = new string[3][]
            {
                new string[] { "i", "love", "mlnet" },
                new string[] { "i", "love", "tlc" },
                new string[] { "i", "dislike", "fika" },
            };
            int tokenRows = 0;
            foreach (var rowTokens in tokensColumn)
            {
                Assert.True(tokenRows < expectedTokens.Length, "The tokens column produced more rows than expected.");
                Assert.Equal(expectedTokens[tokenRows], rowTokens);
                tokenRows++;
            }
            // Without this check an empty or truncated column would pass silently.
            Assert.Equal(expectedTokens.Length, tokenRows);

            // Validate the Features column.
            var featuresColumn = transformedData.GetColumn<float[]>(transformedData.Schema["Features"]);
            var expectedFeatures = new float[3][]
            {
                new float[6] { 1, 1, 1, 0, 0, 0 },
                new float[6] { 1, 1, 0, 1, 0, 0 },
                new float[6] { 1, 0, 0, 0, 1, 1 }
            };
            int featureRows = 0;
            foreach (var rowFeatures in featuresColumn)
            {
                Assert.True(featureRows < expectedFeatures.Length, "The Features column produced more rows than expected.");
                Assert.Equal(expectedFeatures[featureRows], rowFeatures);
                featureRows++;
            }
            // Without this check an empty or truncated column would pass silently.
            Assert.Equal(expectedFeatures.Length, featureRows);
        }

        /// <summary>
        /// Debugging: The schema of the pipeline can be inspected.
        /// </summary>
        [Fact]
        public void InspectPipelineSchema()
        {
            var mlContext = new MLContext(seed: 1);

            // Get the dataset.
            var data = mlContext.Data.LoadFromTextFile<HousingRegression>(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true);

            // Define a pipeline.
            var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features)
                .Append(mlContext.Transforms.Normalize())
                .AppendCacheCheckpoint(mlContext)
                .Append(mlContext.Regression.Trainers.Sdca(
                    new SdcaRegressionTrainer.Options { NumberOfThreads = 1, MaximumNumberOfIterations = 20 }));

            // Fit the pipeline to the data.
            var model = pipeline.Fit(data);

            // Inspect the model schema, and verify that a Score column is produced.
            var outputSchema = model.GetOutputSchema(data.Schema);
            var columnNames = new List<string>();
            foreach (var column in outputSchema)
                columnNames.Add(column.Name);
            Assert.Contains("Score", columnNames);
        }

        /// <summary>
        /// Debugging: The schema read in can be verified by inspecting the data.
        /// </summary>
        [Fact]
        public void InspectSchemaUponLoadingData()
        {
            var mlContext = new MLContext(seed: 1);

            // Get the dataset.
            var data = mlContext.Data.LoadFromTextFile<HousingRegression>(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true);

            // Verify the column names: the first column is the label, the rest follow
            // the order of HousingRegression.Features.
            int columnIndex = 0;
            foreach (var column in data.Schema)
            {
                if (columnIndex == 0)
                    Assert.Equal("Label", column.Name);
                else
                {
                    Assert.True(columnIndex <= HousingRegression.Features.Length, "The schema has more columns than expected.");
                    Assert.Equal(HousingRegression.Features[columnIndex - 1], column.Name);
                }
                columnIndex++;
            }
            // Make sure every expected column was actually seen, not just a prefix of them.
            Assert.Equal(HousingRegression.Features.Length + 1, columnIndex);

            // Verify that I can cast it to the right schema by inspecting the first row.
            int rowsInspected = 0;
            foreach (var row in mlContext.Data.CreateEnumerable<HousingRegression>(mlContext.Data.TakeRows(data, 1), true))
            {
                // Validate there was data in the row by checking that some values were not zero since zero is the default.
                var rowSum = row.MedianHomeValue;
                foreach (var property in HousingRegression.Features)
                {
                    // Fail with the property name rather than a bare NullReferenceException
                    // if a feature name has no matching property on the row type.
                    var propertyInfo = row.GetType().GetProperty(property);
                    Assert.True(propertyInfo != null, "No public property named '" + property + "' was found on the row type.");
                    rowSum += (float)propertyInfo.GetValue(row, null);
                }

                Assert.NotEqual(0, rowSum);
                rowsInspected++;
            }
            // TakeRows(data, 1) was requested; an empty enumeration must not pass silently.
            Assert.Equal(1, rowsInspected);
        }

        /// <summary>
        /// Debugging: The progress of training can be accessed.
        /// </summary>
        [Fact]
        public void ViewTrainingOutput()
        {
            var mlContext = new MLContext(seed: 1);

            // Attach a listener that records every log message raised by the context.
            var logWatcher = new LogWatcher();
            mlContext.Log += logWatcher.ObserveEvent;

            // Get the dataset.
            var data = mlContext.Data.LoadFromTextFile<HousingRegression>(GetDataPath(TestDatasets.housing.trainFilename), hasHeader: true);

            // Define a pipeline.
            var pipeline = mlContext.Transforms.Concatenate("Features", HousingRegression.Features)
                .Append(mlContext.Transforms.Normalize())
                .AppendCacheCheckpoint(mlContext)
                .Append(mlContext.Regression.Trainers.Sdca(
                    new SdcaRegressionTrainer.Options { NumberOfThreads = 1, MaximumNumberOfIterations = 20 }));

            // Fit the pipeline to the data.
            var model = pipeline.Fit(data);

            // Validate that the expected training messages were captured from the log,
            // each of them exactly once.
            var expectedLines = new string[3]
            {
                @"[Source=SdcaTrainerBase; Training, Kind=Info] Auto-tuning parameters: L2 = 0.001.",
                @"[Source=SdcaTrainerBase; Training, Kind=Info] Auto-tuning parameters: L1Threshold (L1/L2) = 0.",
                @"[Source=SdcaTrainerBase; Training, Kind=Info] Using best model from iteration 7."
            };
            foreach (var line in expectedLines)
            {
                Assert.Contains(line, logWatcher.Lines);
                Assert.Equal(1, logWatcher.Lines[line]);
            }
        }

        /// <summary>
        /// Records log messages and counts how many times each distinct message was observed.
        /// </summary>
        /// <remarks>
        /// No synchronization is performed when updating the counts.
        /// </remarks>
        internal class LogWatcher
        {
            /// <summary>
            /// Maps each observed message to the number of times it has been seen.
            /// </summary>
            public readonly IDictionary<string, int> Lines;

            public LogWatcher()
            {
                Lines = new Dictionary<string, int>();
            }

            /// <summary>
            /// Event handler that increments the count stored for <c>e.Message</c>.
            /// </summary>
            /// <param name="sender">The event source; not used.</param>
            /// <param name="e">The logging event whose message is counted.</param>
            public void ObserveEvent(object sender, LoggingEventArgs e)
            {
                // TryGetValue leaves the count at 0 for a message that has not been seen yet.
                int timesSeen;
                Lines.TryGetValue(e.Message, out timesSeen);
                Lines[e.Message] = timesSeen + 1;
            }
        }
    }
}
0 commit comments