Skip to content

Commit 80bde05

Browse files
committed
resolving review comments and adding samples
1 parent b869767 commit 80bde05

File tree

11 files changed

+312
-5
lines changed

11 files changed

+312
-5
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
using System;
2+
using System.Linq;
3+
using Microsoft.ML.Data;
4+
using Microsoft.ML.Trainers;
5+
6+
namespace Microsoft.ML.Samples.Dynamic
7+
{
8+
/// <summary>
/// Sample that trains the Prior binary classification trainer on the sentiment dataset
/// and prints the accuracy measured on a held-out test split.
/// </summary>
public class PriorTrainerSample
{
    /// <summary>
    /// Downloads the sentiment dataset, splits it into training and test sets, fits a
    /// text-featurization + Prior trainer pipeline, and writes the test accuracy to the console.
    /// </summary>
    public static void Example()
    {
        // Downloading the dataset from github.com/dotnet/machinelearning.
        // This will create a sentiment.tsv file in the filesystem.
        // You can open this file, if you want to see the data.
        string dataFile = SamplesUtils.DatasetUtils.DownloadSentimentDataset();

        // A preview of the data.
        // Sentiment    SentimentText
        //     0        " :Erm, thank you. "
        //     1        ==You're cool==

        // Create a new context for ML.NET operations. It can be used for exception tracking and logging,
        // as a catalog of available operations and as the source of randomness.
        var mlContext = new MLContext();

        // Step 1: Read the data as an IDataView.
        // First, we define the reader: specify the data columns and where to find them in the text file.
        var reader = mlContext.Data.CreateTextLoader(
            columns: new[]
            {
                new TextLoader.Column("Sentiment", DataKind.R4, 0),
                new TextLoader.Column("SentimentText", DataKind.Text, 1)
            },
            hasHeader: true
        );

        // Read the data.
        var data = reader.Read(dataFile);

        // Split it between training and test data.
        var (train, test) = mlContext.BinaryClassification.TrainTestSplit(data);

        // Note: the train and test views come from the split above, so re-wrapping the source view in a
        // cache at this point would never be consumed. Caching is instead done inside the pipeline
        // through the cache checkpoint added below.

        // Step 2: Pipeline
        // Featurize the text column through the FeaturizeText API, producing the "Features" column.
        // Then append the Prior binary classification trainer, using the "Sentiment" column as the label.
        var pipeline = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText")
            .AppendCacheCheckpoint(mlContext) // Add a data-cache step within a pipeline.
            .Append(mlContext.BinaryClassification.Trainers.Prior(labelColumn: "Sentiment"));

        // Step 3: Train the pipeline.
        var trainedPipeline = pipeline.Fit(train);

        // Step 4: Evaluate on the test set.
        var transformedData = trainedPipeline.Transform(test);
        var evalMetrics = mlContext.BinaryClassification.Evaluate(transformedData, label: "Sentiment");

        // Step 5: Inspect the output.
        Console.WriteLine($"Accuracy: {evalMetrics.Accuracy}");

        // Expected output:
        // Accuracy: 0.647058823529412
    }
}
72+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
using System;
2+
using System.Linq;
3+
using Microsoft.ML.Data;
4+
using Microsoft.ML.Trainers;
5+
6+
namespace Microsoft.ML.Samples.Dynamic
7+
{
8+
/// <summary>
/// Sample that trains the Random binary classification trainer on the sentiment dataset
/// and prints the accuracy measured on a held-out test split.
/// </summary>
public class RandomTrainerSample
{
    /// <summary>
    /// Downloads the sentiment dataset, splits it into training and test sets, fits a
    /// text-featurization + Random trainer pipeline, and writes the test accuracy to the console.
    /// </summary>
    public static void Example()
    {
        // Fetch the dataset from github.com/dotnet/machinelearning.
        // A sentiment.tsv file is created on disk; open it if you want to look at the raw data.
        string dataFile = SamplesUtils.DatasetUtils.DownloadSentimentDataset();

        // A preview of the data.
        // Sentiment    SentimentText
        //     0        " :Erm, thank you. "
        //     1        ==You're cool==

        // The ML.NET context is the catalog of available operations, the hook for exception tracking
        // and logging, and the source of randomness. It is created here with a fixed seed of 1.
        var mlContext = new MLContext(seed: 1);

        // Step 1: Read the data as an IDataView.
        // Describe the columns of the text file: their names, kinds and positions.
        var columns = new[]
        {
            new TextLoader.Column("Sentiment", DataKind.R4, 0),
            new TextLoader.Column("SentimentText", DataKind.Text, 1)
        };
        var reader = mlContext.Data.CreateTextLoader(columns: columns, hasHeader: true);

        // Load the file through the reader.
        var data = reader.Read(dataFile);

        // Divide the loaded data into a training part and a test part.
        var (trainSet, testSet) = mlContext.BinaryClassification.TrainTestSplit(data);

        // ML.NET doesn't cache data sets by default, so data read from a file and accessed many times
        // can be slow due to expensive featurization and disk operations. When the data fits into
        // memory, caching it there is one solution.
        // NOTE(review): the train and test views were split from the uncached view above and `data`
        // is not read again below, so the caching that affects training is the cache checkpoint
        // inserted in the pipeline.
        data = mlContext.Data.Cache(data);

        // Step 2: Pipeline
        // Turn the text column into the "Features" column with the FeaturizeText API.
        var featurizer = mlContext.Transforms.Text.FeaturizeText("Features", "SentimentText");

        // Add a data-cache step within the pipeline.
        var cachedFeaturizer = featurizer.AppendCacheCheckpoint(mlContext);

        // Finish the pipeline with the Random binary classification trainer.
        var randomTrainer = mlContext.BinaryClassification.Trainers.Random();
        var pipeline = cachedFeaturizer.Append(randomTrainer);

        // Step 3: Train the pipeline on the training part.
        var model = pipeline.Fit(trainSet);

        // Step 4: Score the test part and evaluate against the "Sentiment" label.
        var scoredTestSet = model.Transform(testSet);
        var metrics = mlContext.BinaryClassification.Evaluate(scoredTestSet, label: "Sentiment");

        // Step 5: Inspect the output.
        Console.WriteLine($"Accuracy: {metrics.Accuracy}");

        // Expected output (close to 0.5):
        // Accuracy: 0.588235294117647
    }
}
72+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
using System;
2+
using Microsoft.ML.Data;
3+
4+
namespace Microsoft.ML.Samples.Dynamic
5+
{
6+
/// <summary>
/// Sample showing a custom row mapping applied both directly as a transformer
/// and as the first step of an estimator pipeline.
/// </summary>
public class CustomMappingSample
{
    /// <summary>
    /// Builds the infertility sample data, derives an <c>IsUnderThirty</c> column with a custom
    /// mapping, and then trains a binary FastTree model that uses that derived column as its label.
    /// </summary>
    public static void Example()
    {
        // The ML.NET context is used for exception tracking and logging,
        // and is also the source of randomness.
        var mlContext = new MLContext();

        // Obtain a small dataset as an IEnumerable, then expose it as an IDataView.
        var infertRows = SamplesUtils.DatasetUtils.GetInfertData();
        var trainData = mlContext.Data.ReadFromEnumerable(infertRows);

        // Preview of the data.
        //
        // Age  Case  Education  Induced  Parity  PooledStratum  RowNum  ...
        // 26   1     0-5yrs     1        6       3              1       ...
        // 42   1     0-5yrs     1        1       1              2       ...
        // 39   1     0-5yrs     2        6       4              3       ...
        // 34   1     0-5yrs     2        4       2              4       ...
        // 35   1     6-11yrs    1        3       32             5       ...

        // The custom mapping from an input row to an output row: flag rows whose age is below 30.
        Action<SamplesUtils.DatasetUtils.SampleInfertData, SampleInfertDataTransformed> mapping =
            (input, output) =>
            {
                output.IsUnderThirty = input.Age < 30;
            };

        // A custom transformation can be applied to data directly, or be part of a pipeline.
        // First, the direct application.
        var transformer = mlContext.Transforms.CustomMappingTransformer(mapping, null);
        var transformedData = transformer.Transform(trainData);

        // Preview of the data.
        //
        // IsUnderThirty  Age  Case  Education  Induced  Parity  PooledStratum  RowNum  ...
        // true           26   1     0-5yrs     1        6       3              1       ...
        // false          42   1     0-5yrs     1        1       1              2       ...
        // false          39   1     0-5yrs     2        6       4              3       ...
        // false          34   1     0-5yrs     2        4       2              4       ...
        // false          35   1     6-11yrs    1        3       32             5       ...

        // Second, the same mapping as the first estimator of a pipeline.
        var customMapping = mlContext.Transforms.CustomMapping(mapping, null);
        var concatenate = mlContext.Transforms.Concatenate(
            outputColumnName: "Features",
            inputColumnNames: new[] { "Parity", "Induced" });

        // It is useful to add a caching checkpoint before a trainer that does several passes over the data.
        var featurized = customMapping.Append(concatenate).AppendCacheCheckpoint(mlContext);

        // Binary FastTree predicts the label column generated by the custom mapping at the first step.
        var trainer = mlContext.BinaryClassification.Trainers.FastTree(labelColumn: "IsUnderThirty");
        var pipeline = featurized.Append(trainer);

        // Train the pipeline, then use the resulting model to transform the data.
        var model = pipeline.Fit(trainData);
        transformedData = model.Transform(trainData);
    }

    /// <summary>
    /// Output row type of the custom mapping: the transformed infertility dataset.
    /// </summary>
    public class SampleInfertDataTransformed
    {
        public int RowNum { get; set; }
        public string Education { get; set; }
        // Set by the custom mapping in Example(): true when the input row's Age is below 30.
        public bool IsUnderThirty { get; set; }
        public float Parity { get; set; }
        public float Induced { get; set; }
        public float Case { get; set; }
        public float Spontaneous { get; set; }
        public float Stratum { get; set; }
        public float PooledStratum { get; set; }
    }
}
70+
}

src/Microsoft.ML.Data/Scorers/PredictionTransformer.cs

+4
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,10 @@ public abstract class PredictionTransformerBase<TModel> : IPredictionTransformer
5454
private protected ISchemaBindableMapper BindableMapper;
5555
protected Schema TrainSchema;
5656

57+
/// <summary>
58+
/// Whether a call to <see cref="GetRowToRowMapper(Schema)"/> should succeed, on an
59+
/// appropriate schema.
60+
/// </summary>
5761
public bool IsRowToRowMapper => true;
5862

5963
/// <summary>

src/Microsoft.ML.StandardLearners/Standard/MultiClass/MultiClassNaiveBayesTrainer.cs

+7
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,15 @@ internal sealed class Options : LearnerInputBaseWithLabel
3939
{
4040
}
4141

42+
/// <summary> Return the type of prediction task.</summary>
4243
public override PredictionKind PredictionKind => PredictionKind.MultiClassClassification;
4344

4445
private static readonly TrainerInfo _info = new TrainerInfo(normalization: false, caching: false);
46+
47+
/// <summary>
48+
/// Auxiliary information about the trainer in terms of its capabilities
49+
/// and requirements.
50+
/// </summary>
4551
public override TrainerInfo Info => _info;
4652

4753
/// <summary>
@@ -201,6 +207,7 @@ private static VersionInfo GetVersionInfo()
201207
private readonly VectorType _inputType;
202208
private readonly VectorType _outputType;
203209

210+
/// <summary> Return the type of prediction task.</summary>
204211
public override PredictionKind PredictionKind => PredictionKind.MultiClassClassification;
205212

206213
ColumnType IValueMapper.InputType => _inputType;

src/Microsoft.ML.StandardLearners/Standard/MultiClass/Ova.cs

+6
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
using Microsoft.Data.DataView;
1212
using Microsoft.ML;
1313
using Microsoft.ML.CommandLine;
14+
using Microsoft.ML.Core.Data;
1415
using Microsoft.ML.Data;
1516
using Microsoft.ML.EntryPoints;
1617
using Microsoft.ML.Internal.Calibration;
@@ -54,6 +55,9 @@ public sealed class Ova : MetaMulticlassTrainer<MulticlassPredictionTransformer<
5455
/// </summary>
5556
internal sealed class Options : ArgumentsBase
5657
{
58+
/// <summary>
59+
/// Whether to use probabilities (vs. raw outputs) to identify top-score category.
60+
/// </summary>
5761
[Argument(ArgumentType.AtMostOnce, HelpText = "Use probability or margins to determine max", ShortName = "useprob")]
5862
[TGUI(Label = "Use Probability", Description = "Use probabilities (vs. raw outputs) to identify top-score category")]
5963
public bool UseProbabilities = true;
@@ -169,6 +173,7 @@ private IDataView MapLabels(RoleMappedData data, int cls)
169173
throw Host.ExceptNotSupp($"Label column type is not supported by OVA: {lab.Type}");
170174
}
171175

176+
/// <summary> Trains and returns an <see cref="ITransformer"/>.</summary>
172177
public override MulticlassPredictionTransformer<OvaModelParameters> Fit(IDataView input)
173178
{
174179
var roles = new KeyValuePair<CR, string>[1];
@@ -227,6 +232,7 @@ private static VersionInfo GetVersionInfo()
227232

228233
public ImmutableArray<object> SubModelParameters => _impl.Predictors.Cast<object>().ToImmutableArray();
229234

235+
/// <summary> Return the type of prediction task.</summary>
230236
public override PredictionKind PredictionKind => PredictionKind.MultiClassClassification;
231237

232238
/// <summary>

src/Microsoft.ML.StandardLearners/Standard/MultiClass/Pkpd.cs

+1
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,7 @@ private static VersionInfo GetVersionInfo()
242242
private readonly TDistPredictor[] _predictors;
243243
private readonly IValueMapperDist[] _mappers;
244244

245+
/// <summary> Return the type of prediction task.</summary>
245246
public override PredictionKind PredictionKind => PredictionKind.MultiClassClassification;
246247
private readonly VectorType _inputType;
247248
private readonly ColumnType _outputType;

0 commit comments

Comments
 (0)