Skip to content

Commit bbf68a8

Browse files
authored
Merge pull request #3174 from shauheen/release/rc1
Cherry-pick for RC1
2 parents 80cb3af + 51de060 commit bbf68a8

File tree

46 files changed

+830
-515
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

46 files changed

+830
-515
lines changed

build/vsts-ci.yml

+6-6
Original file line numberDiff line numberDiff line change
@@ -236,12 +236,12 @@ phases:
236236
nuGetFeedType: internal
237237
feedPublish: MachineLearning
238238

239-
- task: MSBuild@1
240-
displayName: Publish Packages to MyGet Feed
241-
inputs:
242-
solution: build/publish.proj
243-
msbuildArguments: /t:PublishPackages /p:NuGetFeedUrl=$(_NuGetFeedUrl) /p:NuGetApiKey=$(dotnet-myget-org-api-key)
244-
msbuildVersion: 15.0
239+
# - task: MSBuild@1
240+
# displayName: Publish Packages to MyGet Feed
241+
# inputs:
242+
# solution: build/publish.proj
243+
# msbuildArguments: /t:PublishPackages /p:NuGetFeedUrl=$(_NuGetFeedUrl) /p:NuGetApiKey=$(dotnet-myget-org-api-key)
244+
# msbuildVersion: 15.0
245245

246246
- task: MSBuild@1
247247
displayName: Publish Symbols to SymWeb Symbol Server

docs/code/MlNetCookBook.md

+4-7
Original file line numberDiff line numberDiff line change
@@ -344,7 +344,7 @@ var cachedTrainData = mlContext.Data.Cache(trainData);
344344
var pipeline =
345345
// First 'normalize' the data (rescale to be
346346
// between -1 and 1 for all examples)
347-
mlContext.Transforms.Normalize("FeatureVector")
347+
mlContext.Transforms.NormalizeMinMax("FeatureVector")
348348
// We add a step for caching data in memory so that the downstream iterative training
349349
// algorithm can efficiently scan through the data multiple times. Otherwise, the following
350350
// trainer will load data from disk multiple times. The caching mechanism uses an on-demand strategy.
@@ -625,18 +625,15 @@ var trainData = mlContext.Data.LoadFromTextFile<IrisInputAllFeatures>(dataPath,
625625
separatorChar: ','
626626
);
627627

628-
// Apply all kinds of standard ML.NET normalization to the raw features.
628+
// Apply MinMax normalization to the raw features.
629629
var pipeline =
630-
mlContext.Transforms.Normalize(
631-
new NormalizingEstimator.MinMaxColumnOptions("MinMaxNormalized", "Features", fixZero: true),
632-
new NormalizingEstimator.MeanVarianceColumnOptions("MeanVarNormalized", "Features", fixZero: true),
633-
new NormalizingEstimator.BinningColumnOptions("BinNormalized", "Features", maximumBinCount: 256));
630+
mlContext.Transforms.NormalizeMinMax("MinMaxNormalized", "Features");
634631

635632
// Let's train our pipeline of normalizers, and then apply it to the same data.
636633
var normalizedData = pipeline.Fit(trainData).Transform(trainData);
637634

638635
// Inspect one column of the resulting dataset.
639-
var meanVarValues = normalizedData.GetColumn<float[]>(normalizedData.Schema["MeanVarNormalized"]).ToArray();
636+
var meanVarValues = normalizedData.GetColumn<float[]>(normalizedData.Schema["MinMaxNormalized"]).ToArray();
640637
```
641638

642639
## How do I train my model on categorical data?

docs/code/VBufferCareFeeding.md

+164-167
Large diffs are not rendered by default.

docs/samples/Microsoft.ML.Samples/Dynamic/DataOperations/BootstrapSample.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ public static void Example()
1212
var mlContext = new MLContext();
1313

1414
// Get a small dataset as an IEnumerable and then read it as ML.NET's data type.
15-
IEnumerable<SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorSample> enumerableOfData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorSamples(5);
15+
IEnumerable<SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample> enumerableOfData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(5);
1616
var data = mlContext.Data.LoadFromEnumerable(enumerableOfData);
1717

1818
// Look at the original dataset
@@ -43,7 +43,7 @@ public static void Example()
4343
{
4444
var resample = mlContext.Data.BootstrapSample(data, seed: i);
4545

46-
var enumerable = mlContext.Data.CreateEnumerable<SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorSample>(resample, reuseRowObject: false);
46+
var enumerable = mlContext.Data.CreateEnumerable<SamplesUtils.DatasetUtils.BinaryLabelFloatFeatureVectorFloatWeightSample>(resample, reuseRowObject: false);
4747
Console.WriteLine($"Label\tFeatures[0]");
4848
foreach (var row in enumerable)
4949
{

docs/samples/Microsoft.ML.Samples/Dynamic/Normalizer.cs

+3-5
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
using System;
22
using System.Collections.Generic;
3-
using System.Linq;
43
using Microsoft.ML.Data;
5-
using Microsoft.ML.Transforms;
64

75
namespace Microsoft.ML.Samples.Dynamic
86
{
@@ -28,7 +26,7 @@ public static void Example()
2826
// 35 1 6-11yrs 1 3 32 5 ...
2927

3028
// A pipeline for normalizing the Induced column.
31-
var pipeline = ml.Transforms.Normalize("Induced");
29+
var pipeline = ml.Transforms.NormalizeMinMax("Induced");
3230
// The transformed (normalized according to Normalizer.NormalizerMode.MinMax) data.
3331
var transformer = pipeline.Fit(trainData);
3432

@@ -58,8 +56,8 @@ public static void Example()
5856

5957
// Composing a different pipeline if we wanted to normalize more than one column at a time.
6058
// Using min-max as the normalization mode.
61-
var multiColPipeline = ml.Transforms.Normalize("LogInduced", "Induced", NormalizingEstimator.NormalizationMode.LogMeanVariance)
62-
.Append(ml.Transforms.Normalize("LogSpontaneous", "Spontaneous", NormalizingEstimator.NormalizationMode.LogMeanVariance));
59+
var multiColPipeline = ml.Transforms.NormalizeMinMax("LogInduced", "Induced")
60+
.Append(ml.Transforms.NormalizeMinMax("LogSpontaneous", "Spontaneous"));
6361
// The transformed data.
6462
var multiColtransformer = multiColPipeline.Fit(trainData);
6563
var multiColtransformedData = multiColtransformer.Transform(trainData);

docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PFIRegressionExample.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ public static void Example()
1919
// Normalize the data set so that for each feature, its maximum value is 1 while its minimum value is 0.
2020
// Then append a linear regression trainer.
2121
var pipeline = mlContext.Transforms.Concatenate("Features", featureNames)
22-
.Append(mlContext.Transforms.Normalize("Features"))
22+
.Append(mlContext.Transforms.NormalizeMinMax("Features"))
2323
.Append(mlContext.Regression.Trainers.Ols(
2424
labelColumnName: labelName, featureColumnName: "Features"));
2525
var model = pipeline.Fit(data);

docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance/PfiBinaryClassificationExample.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ public static void Example()
2121
// Normalize the data set so that for each feature, its maximum value is 1 while its minimum value is 0.
2222
// Then append a logistic regression trainer.
2323
var pipeline = mlContext.Transforms.Concatenate("Features", featureNames)
24-
.Append(mlContext.Transforms.Normalize("Features"))
24+
.Append(mlContext.Transforms.NormalizeMinMax("Features"))
2525
.Append(mlContext.BinaryClassification.Trainers.LbfgsLogisticRegression(
2626
labelColumnName: labelName, featureColumnName: "Features"));
2727
var model = pipeline.Fit(data);

docs/samples/Microsoft.ML.Samples/Dynamic/Trainers/BinaryClassification/StochasticDualCoordinateAscentNonCalibrated.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ public static class StochasticDualCoordinateAscentNonCalibrated
99
public static void Example()
1010
{
1111
// Generate IEnumerable<BinaryLabelFloatFeatureVectorFloatWeightSample> as training examples.
12-
var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorSamples(100);
12+
var rawData = SamplesUtils.DatasetUtils.GenerateBinaryLabelFloatFeatureVectorFloatWeightSamples(100);
1313

1414
// Information in first example.
1515
// Label: true

src/Microsoft.ML.Core/Data/AnnotationUtils.cs

+10-2
Original file line numberDiff line numberDiff line change
@@ -441,8 +441,16 @@ public static bool TryGetCategoricalFeatureIndices(DataViewSchema schema, int co
441441
public static IEnumerable<SchemaShape.Column> AnnotationsForMulticlassScoreColumn(SchemaShape.Column? labelColumn = null)
442442
{
443443
var cols = new List<SchemaShape.Column>();
444-
if (labelColumn != null && labelColumn.Value.IsKey && NeedsSlotNames(labelColumn.Value))
445-
cols.Add(new SchemaShape.Column(Kinds.SlotNames, SchemaShape.Column.VectorKind.Vector, TextDataViewType.Instance, false));
444+
if (labelColumn != null && labelColumn.Value.IsKey)
445+
{
446+
if (labelColumn.Value.Annotations.TryFindColumn(Kinds.KeyValues, out var metaCol) &&
447+
metaCol.Kind == SchemaShape.Column.VectorKind.Vector)
448+
{
449+
if (metaCol.ItemType is TextDataViewType)
450+
cols.Add(new SchemaShape.Column(Kinds.SlotNames, SchemaShape.Column.VectorKind.Vector, TextDataViewType.Instance, false));
451+
cols.Add(new SchemaShape.Column(Kinds.TrainingLabelValues, SchemaShape.Column.VectorKind.Vector, metaCol.ItemType, false));
452+
}
453+
}
446454
cols.AddRange(GetTrainerOutputAnnotation());
447455
return cols;
448456
}

src/Microsoft.ML.Data/Scorers/MulticlassClassificationScorer.cs

+40-10
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ private static ISchemaBoundMapper WrapIfNeeded(IHostEnvironment env, ISchemaBoun
390390
if (trainSchema?.Label == null)
391391
return mapper; // We don't even have a label identified in a training schema.
392392
var keyType = trainSchema.Label.Value.Annotations.Schema.GetColumnOrNull(AnnotationUtils.Kinds.KeyValues)?.Type as VectorDataViewType;
393-
if (keyType == null || !CanWrap(mapper, keyType))
393+
if (keyType == null)
394394
return mapper;
395395

396396
// Great!! All checks pass.
@@ -409,11 +409,19 @@ private static ISchemaBoundMapper WrapIfNeeded(IHostEnvironment env, ISchemaBoun
409409
/// from the model of a bindable mapper)</param>
410410
/// <returns>Whether we can call <see cref="LabelNameBindableMapper.CreateBound{T}"/> with
411411
/// this mapper and expect it to succeed</returns>
412-
internal static bool CanWrap(ISchemaBoundMapper mapper, DataViewType labelNameType)
412+
internal static bool CanWrapTrainingLabels(ISchemaBoundMapper mapper, DataViewType labelNameType)
413+
{
414+
if (GetTypesForWrapping(mapper, labelNameType, AnnotationUtils.Kinds.TrainingLabelValues, out var scoreType))
415+
// Check that the type is vector, and is of compatible size with the score output.
416+
return labelNameType is VectorDataViewType vectorType && vectorType.Size == scoreType.GetVectorSize();
417+
return false;
418+
}
419+
420+
internal static bool GetTypesForWrapping(ISchemaBoundMapper mapper, DataViewType labelNameType, string metaKind, out DataViewType scoreType)
413421
{
414422
Contracts.AssertValue(mapper);
415423
Contracts.AssertValue(labelNameType);
416-
424+
scoreType = null;
417425
ISchemaBoundRowMapper rowMapper = mapper as ISchemaBoundRowMapper;
418426
if (rowMapper == null)
419427
return false; // We could cover this case, but it is of no practical worth as far as I see, so I decline to do so.
@@ -423,12 +431,30 @@ internal static bool CanWrap(ISchemaBoundMapper mapper, DataViewType labelNameTy
423431
var scoreCol = outSchema.GetColumnOrNull(AnnotationUtils.Const.ScoreValueKind.Score);
424432
if (!outSchema.TryGetColumnIndex(AnnotationUtils.Const.ScoreValueKind.Score, out scoreIdx))
425433
return false; // The mapper doesn't even publish a score column to attach the metadata to.
426-
if (outSchema[scoreIdx].Annotations.Schema.GetColumnOrNull(AnnotationUtils.Kinds.SlotNames)?.Type != null)
427-
return false; // The mapper publishes a score column, and already produces its own slot names.
428-
var scoreType = outSchema[scoreIdx].Type;
434+
if (outSchema[scoreIdx].Annotations.Schema.GetColumnOrNull(metaKind)?.Type != null)
435+
return false; // The mapper publishes a score column, and already produces its own metakind.
436+
scoreType = outSchema[scoreIdx].Type;
437+
return true;
438+
}
429439

430-
// Check that the type is vector, and is of compatible size with the score output.
431-
return labelNameType is VectorDataViewType vectorType && vectorType.Size == scoreType.GetVectorSize() && vectorType.ItemType == TextDataViewType.Instance;
440+
/// <summary>
441+
/// This is a utility method used to determine whether <see cref="LabelNameBindableMapper"/>
442+
/// can or should be used to wrap <paramref name="mapper"/>. This will not throw, since the
443+
/// desired behavior in the event that it cannot be wrapped, is to just back off to the original
444+
/// "unwrapped" bound mapper.
445+
/// </summary>
446+
/// <param name="mapper">The mapper we are seeing if we can wrap</param>
447+
/// <param name="labelNameType">The type of the label names from the metadata (either
448+
/// originating from the key value metadata of the training label column, or deserialized
449+
/// from the model of a bindable mapper)</param>
450+
/// <returns>Whether we can call <see cref="LabelNameBindableMapper.CreateBound{T}"/> with
451+
/// this mapper and expect it to succeed</returns>
452+
internal static bool CanWrapSlotNames(ISchemaBoundMapper mapper, DataViewType labelNameType)
453+
{
454+
if (GetTypesForWrapping(mapper, labelNameType, AnnotationUtils.Kinds.SlotNames, out var scoreType))
455+
// Check that the type is vector, and is of compatible size with the score output.
456+
return labelNameType is VectorDataViewType vectorType && vectorType.Size == scoreType.GetVectorSize() && vectorType.ItemType == TextDataViewType.Instance;
457+
return false;
432458
}
433459

434460
internal static ISchemaBoundMapper WrapCore<T>(IHostEnvironment env, ISchemaBoundMapper mapper, RoleMappedSchema trainSchema)
@@ -449,8 +475,12 @@ internal static ISchemaBoundMapper WrapCore<T>(IHostEnvironment env, ISchemaBoun
449475
{
450476
trainSchema.Label.Value.GetKeyValues(ref value);
451477
};
452-
453-
return LabelNameBindableMapper.CreateBound<T>(env, (ISchemaBoundRowMapper)mapper, type as VectorDataViewType, getter, AnnotationUtils.Kinds.SlotNames, CanWrap);
478+
var resultMapper = mapper;
479+
if (CanWrapTrainingLabels(resultMapper, type))
480+
resultMapper = LabelNameBindableMapper.CreateBound<T>(env, (ISchemaBoundRowMapper)resultMapper, type as VectorDataViewType, getter, AnnotationUtils.Kinds.TrainingLabelValues, CanWrapTrainingLabels);
481+
if (CanWrapSlotNames(resultMapper, type))
482+
resultMapper = LabelNameBindableMapper.CreateBound<T>(env, (ISchemaBoundRowMapper)resultMapper, type as VectorDataViewType, getter, AnnotationUtils.Kinds.SlotNames, CanWrapSlotNames);
483+
return resultMapper;
454484
}
455485

456486
[BestFriend]

src/Microsoft.ML.Data/Scorers/PredictedLabelScorerBase.cs

+5-15
Original file line numberDiff line numberDiff line change
@@ -62,22 +62,12 @@ private BindingsImpl(DataViewSchema input, ISchemaBoundRowMapper mapper, string
6262
{
6363
var scoreColMetadata = mapper.OutputSchema[scoreColIndex].Annotations;
6464

65-
var slotColumn = scoreColMetadata.Schema.GetColumnOrNull(AnnotationUtils.Kinds.SlotNames);
66-
if (slotColumn?.Type is VectorDataViewType slotColVecType && (ulong)slotColVecType.Size == predColKeyType.Count)
65+
var trainLabelColumn = scoreColMetadata.Schema.GetColumnOrNull(AnnotationUtils.Kinds.TrainingLabelValues);
66+
if (trainLabelColumn?.Type is VectorDataViewType trainLabelColVecType && (ulong)trainLabelColVecType.Size == predColKeyType.Count)
6767
{
68-
Contracts.Assert(slotColVecType.Size > 0);
69-
_predColMetadata = Utils.MarshalInvoke(KeyValueMetadataFromMetadata<int>, slotColVecType.RawType,
70-
scoreColMetadata, slotColumn.Value);
71-
}
72-
else
73-
{
74-
var trainLabelColumn = scoreColMetadata.Schema.GetColumnOrNull(AnnotationUtils.Kinds.TrainingLabelValues);
75-
if (trainLabelColumn?.Type is VectorDataViewType trainLabelColVecType && (ulong)trainLabelColVecType.Size == predColKeyType.Count)
76-
{
77-
Contracts.Assert(trainLabelColVecType.Size > 0);
78-
_predColMetadata = Utils.MarshalInvoke(KeyValueMetadataFromMetadata<int>, trainLabelColVecType.RawType,
79-
scoreColMetadata, trainLabelColumn.Value);
80-
}
68+
Contracts.Assert(trainLabelColVecType.Size > 0);
69+
_predColMetadata = Utils.MarshalInvoke(KeyValueMetadataFromMetadata<int>, trainLabelColVecType.RawType,
70+
scoreColMetadata, trainLabelColumn.Value);
8171
}
8272
}
8373
}

src/Microsoft.ML.Data/Transforms/ColumnConcatenatingEstimator.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -87,12 +87,12 @@ private SchemaShape.Column CheckInputsAndMakeColumn(
8787
// Appending keys makes no real sense anyway.
8888
if (col.IsKey)
8989
{
90-
throw _host.Except($"Column '{sources[i]}' is key." +
90+
throw _host.Except($"Column '{sources[i]}' is key. " +
9191
$"Concatenation of keys is unsupported.");
9292
}
9393
if (!col.ItemType.Equals(itemType))
9494
{
95-
throw _host.Except($"Column '{sources[i]}' has values of {col.ItemType}" +
95+
throw _host.Except($"Column '{sources[i]}' has values of {col.ItemType}, " +
9696
$"which is not the same as earlier observed type of {itemType}.");
9797
}
9898
varVector |= col.Kind == SchemaShape.Column.VectorKind.VariableVector;

src/Microsoft.ML.Data/Transforms/Normalizer.cs

+2-1
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,8 @@ internal static class Defaults
3939
public const long MaximumExampleCount = 1000000000;
4040
}
4141

42-
public enum NormalizationMode
42+
[BestFriend]
43+
internal enum NormalizationMode
4344
{
4445
/// <summary>
4546
/// Linear rescale such that minimum and maximum values are mapped between -1 and 1.

0 commit comments

Comments
 (0)