From 6fd888d969e8113b981b2aa9b24f85eb4084caaf Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Thu, 24 May 2018 10:55:59 -0700 Subject: [PATCH 1/2] introduce IUnsupervisedLearningWithWeights --- .../Common/EntryPoints/core_manifest.json | 31 +++++++++++++++++++ .../EntryPoints/InputBase.cs | 18 +++++++++++ .../KMeansPlusPlusTrainer.cs | 5 +-- src/Microsoft.ML.PCA/PcaTrainer.cs | 5 +-- 4 files changed, 53 insertions(+), 6 deletions(-) diff --git a/ZBaselines/Common/EntryPoints/core_manifest.json b/ZBaselines/Common/EntryPoints/core_manifest.json index d705b26010..54bc877afa 100644 --- a/ZBaselines/Common/EntryPoints/core_manifest.json +++ b/ZBaselines/Common/EntryPoints/core_manifest.json @@ -8957,6 +8957,18 @@ "IsNullable": false, "Default": "Features" }, + { + "Name": "WeightColumn", + "Type": "String", + "Desc": "Column to use for example weight", + "Aliases": [ + "weight" + ], + "Required": false, + "SortOrder": 4.0, + "IsNullable": false, + "Default": "Weight" + }, { "Name": "NormalizeFeatures", "Type": { @@ -9093,6 +9105,7 @@ } ], "InputKind": [ + "IUnsupervisedTrainerWithWeight", "ITrainerInput" ], "OutputKind": [ @@ -10715,6 +10728,7 @@ } ], "InputKind": [ + "IUnsupervisedTrainerWithWeight", "ITrainerInput" ], "OutputKind": [ @@ -22417,6 +22431,23 @@ "Type": "TransformModel" } ] + }, + { + "Kind": "IUnsupervisedTrainerWithWeight", + "Settings": [ + { + "Name": "WeightColumn", + "Type": "String" + }, + { + "Name": "TrainingData", + "Type": "DataView" + }, + { + "Name": "FeatureColumn", + "Type": "String" + } + ] } ] } \ No newline at end of file diff --git a/src/Microsoft.ML.Data/EntryPoints/InputBase.cs b/src/Microsoft.ML.Data/EntryPoints/InputBase.cs index 57a7c9120f..5583c66df0 100644 --- a/src/Microsoft.ML.Data/EntryPoints/InputBase.cs +++ b/src/Microsoft.ML.Data/EntryPoints/InputBase.cs @@ -69,6 +69,16 @@ public abstract class LearnerInputBaseWithWeight : LearnerInputBaseWithLabel public Optional WeightColumn = 
Optional.Implicit(DefaultColumnNames.Weight); } + /// <summary> + /// The base class for all unsupervised learner inputs that support a weight column. + /// </summary> + [TlcModule.EntryPointKind(typeof(CommonInputs.IUnsupervisedTrainerWithWeight))] + public abstract class UnsupervisedLearnerInputBaseWithWeight : LearnerInputBase + { + [Argument(ArgumentType.AtMostOnce, HelpText = "Column to use for example weight", ShortName = "weight", SortOrder = 4, Visibility = ArgumentAttribute.VisibilityType.EntryPointsOnly)] + public Optional<string> WeightColumn = Optional<string>.Implicit(DefaultColumnNames.Weight); + } + /// <summary> /// The base class for all evaluators inputs. /// </summary> @@ -224,6 +234,14 @@ public interface ITrainerInputWithLabel : ITrainerInput string LabelColumn { get; } } + /// <summary> + /// Interface that all API trainer input classes will implement. + /// </summary> + public interface IUnsupervisedTrainerWithWeight : ITrainerInput + { + Optional<string> WeightColumn { get; } + } + /// /// Interface that all API trainer input classes will implement. 
/// diff --git a/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs b/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs index b833278890..627d176d85 100644 --- a/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs +++ b/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs @@ -44,7 +44,7 @@ public enum InitAlgorithm KMeansParallel = 2 } - public class Arguments : LearnerInputBase + public class Arguments : UnsupervisedLearnerInputBaseWithWeight { [Argument(ArgumentType.AtMostOnce, HelpText = "The number of clusters", SortOrder = 50)] [TGUI(SuggestedSweeps = "5,10,20,40")] @@ -234,7 +234,8 @@ public static CommonOutputs.ClusteringOutput TrainKMeans(IHostEnvironment env, A EntryPointUtils.CheckInputArgs(host, input); return LearnerEntryPointsUtils.Train(host, input, - () => new KMeansPlusPlusTrainer(host, input)); + () => new KMeansPlusPlusTrainer(host, input), + getWeight: () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.WeightColumn)); } } diff --git a/src/Microsoft.ML.PCA/PcaTrainer.cs b/src/Microsoft.ML.PCA/PcaTrainer.cs index e46cc120fb..1d34db4b21 100644 --- a/src/Microsoft.ML.PCA/PcaTrainer.cs +++ b/src/Microsoft.ML.PCA/PcaTrainer.cs @@ -49,7 +49,7 @@ public sealed class RandomizedPcaTrainer : TrainerBase WeightColumn = Optional.Implicit(DefaultColumnNames.Weight); } private int _dimension; From 35e44b66aef0c1d165f8779a1294a0b0780bc93d Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Thu, 24 May 2018 13:02:32 -0700 Subject: [PATCH 2/2] Add test to check that KMeans doesn't need a label and can handle the presence of a weight column. Also extract the real weight value from the cursor. 
--- src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs | 2 +- test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs b/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs index 627d176d85..1f09ec850f 100644 --- a/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs +++ b/src/Microsoft.ML.KMeansClustering/KMeansPlusPlusTrainer.cs @@ -162,7 +162,7 @@ private void TrainCore(IChannel ch, RoleMappedData data) long missingFeatureCount; long totalTrainingInstances; - var cursorFactory = new FeatureFloatVectorCursor.Factory(data, CursOpt.Features | CursOpt.Id); + var cursorFactory = new FeatureFloatVectorCursor.Factory(data, CursOpt.Features | CursOpt.Id | CursOpt.Weight); // REVIEW: It would be nice to extract these out into subcomponents in the future. We should // revisit and even consider breaking these all into individual KMeans-flavored trainers, they // all produce a valid set of output centroids with various trade-offs in runtime (with perhaps diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index 0ef21b3ef8..b91fd9365b 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -1918,7 +1918,7 @@ public void EntryPointTrainTestMacroNoTransformInput() [Fact] public void EntryPointKMeans() { - TestEntryPointRoutine("Train-Tiny-28x28.txt", "Trainers.KMeansPlusPlusClusterer"); + TestEntryPointRoutine("Train-Tiny-28x28.txt", "Trainers.KMeansPlusPlusClusterer", "col=Weight:R4:0 col=Features:R4:1-784", ",'InitAlgorithm':'KMeansPlusPlus'"); } [Fact]