From a43d1a52aaa5a97017d7984f4f36ff0013489cb7 Mon Sep 17 00:00:00 2001 From: Yael Dekel Date: Wed, 23 May 2018 14:22:32 -0700 Subject: [PATCH 1/2] Remove label requirement for PCA anomaly detector entry point. --- src/Microsoft.ML.PCA/PcaTrainer.cs | 10 +- src/Microsoft.ML/CSharpApi.cs | 279 ++++++++++++++++++ .../UnitTests/TestEntryPoints.cs | 2 +- .../Microsoft.ML.Tests.csproj | 1 + 4 files changed, 287 insertions(+), 5 deletions(-) diff --git a/src/Microsoft.ML.PCA/PcaTrainer.cs b/src/Microsoft.ML.PCA/PcaTrainer.cs index 255f90e61b..e46cc120fb 100644 --- a/src/Microsoft.ML.PCA/PcaTrainer.cs +++ b/src/Microsoft.ML.PCA/PcaTrainer.cs @@ -49,7 +49,7 @@ public sealed class RandomizedPcaTrainer : TrainerBase WeightColumn = Optional.Implicit(DefaultColumnNames.Weight); } private int _dimension; @@ -294,8 +297,7 @@ public static CommonOutputs.AnomalyDetectionOutput TrainPcaAnomaly(IHostEnvironm return LearnerEntryPointsUtils.Train(host, input, () => new RandomizedPcaTrainer(host, input), - () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.LabelColumn), - () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.WeightColumn)); + getWeight: () => LearnerEntryPointsUtils.FindColumn(host, input.TrainingData.Schema, input.WeightColumn)); } } diff --git a/src/Microsoft.ML/CSharpApi.cs b/src/Microsoft.ML/CSharpApi.cs index 317ee98db0..f1ea0a2d4b 100644 --- a/src/Microsoft.ML/CSharpApi.cs +++ b/src/Microsoft.ML/CSharpApi.cs @@ -550,6 +550,18 @@ public void Add(Microsoft.ML.Trainers.OrdinaryLeastSquaresRegressor input, Micro _jsonNodes.Add(Serialize("Trainers.OrdinaryLeastSquaresRegressor", input, output)); } + public Microsoft.ML.Trainers.PcaAnomalyDetector.Output Add(Microsoft.ML.Trainers.PcaAnomalyDetector input) + { + var output = new Microsoft.ML.Trainers.PcaAnomalyDetector.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Trainers.PcaAnomalyDetector input, Microsoft.ML.Trainers.PcaAnomalyDetector.Output output) + { + _jsonNodes.Add(Serialize("Trainers.PcaAnomalyDetector", input, output)); + } + public Microsoft.ML.Trainers.PoissonRegressor.Output Add(Microsoft.ML.Trainers.PoissonRegressor input) { var output = new Microsoft.ML.Trainers.PoissonRegressor.Output(); @@ -1090,6 +1102,18 @@ public void Add(Microsoft.ML.Transforms.OptionalColumnCreator input, Microsoft.M _jsonNodes.Add(Serialize("Transforms.OptionalColumnCreator", input, output)); } + public Microsoft.ML.Transforms.PcaCalculator.Output Add(Microsoft.ML.Transforms.PcaCalculator input) + { + var output = new Microsoft.ML.Transforms.PcaCalculator.Output(); + Add(input, output); + return output; + } + + public void Add(Microsoft.ML.Transforms.PcaCalculator input, Microsoft.ML.Transforms.PcaCalculator.Output output) + { + _jsonNodes.Add(Serialize("Transforms.PcaCalculator", input, output)); + } + public Microsoft.ML.Transforms.PredictedLabelColumnOriginalValueConverter.Output Add(Microsoft.ML.Transforms.PredictedLabelColumnOriginalValueConverter input) { var output = new Microsoft.ML.Transforms.PredictedLabelColumnOriginalValueConverter.Output(); @@ -6739,6 +6763,97 @@ public OrdinaryLeastSquaresRegressorPipelineStep(Output output) } } + namespace Trainers + { + + /// + /// Train an PCA Anomaly model. + /// + public sealed partial class PcaAnomalyDetector : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITrainerInput, Microsoft.ML.ILearningPipelineItem + { + + + /// + /// The number of components in the PCA + /// + [TlcModule.SweepableDiscreteParamAttribute("Rank", new object[]{10, 20, 40, 80})] + public int Rank { get; set; } = 20; + + /// + /// Oversampling parameter for randomized PCA training + /// + [TlcModule.SweepableDiscreteParamAttribute("Oversampling", new object[]{10, 20, 40})] + public int Oversampling { get; set; } = 20; + + /// + /// If enabled, data is centered to be zero mean + /// + [TlcModule.SweepableDiscreteParamAttribute("Center", new object[]{false, true})] + public bool Center { get; set; } = true; + + /// + /// The seed for random number generation + /// + public int? Seed { get; set; } + + /// + /// Column to use for example weight + /// + public Microsoft.ML.Runtime.EntryPoints.Optional WeightColumn { get; set; } + + /// + /// The data to be used for training + /// + public Var TrainingData { get; set; } = new Var(); + + /// + /// Column to use for features + /// + public string FeatureColumn { get; set; } = "Features"; + + /// + /// Normalize option for the feature column + /// + public Models.NormalizeOption NormalizeFeatures { get; set; } = Models.NormalizeOption.Auto; + + /// + /// Whether learner should cache input training data + /// + public Models.CachingOptions Caching { get; set; } = Models.CachingOptions.Auto; + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.IAnomalyDetectionOutput, Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITrainerOutput + { + /// + /// The trained model + /// + public Var PredictorModel { get; set; } = new Var(); + + } + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(PcaAnomalyDetector)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + TrainingData = dataStep.Data; + Output output = experiment.Add(this); + return new PcaAnomalyDetectorPipelineStep(output); + } + + private class PcaAnomalyDetectorPipelineStep : ILearningPipelinePredictorStep + { + public PcaAnomalyDetectorPipelineStep(Output output) + { + Model = output.PredictorModel; + } + + public Var Model { get; } + } + } + } + namespace Trainers { @@ -11417,6 +11532,170 @@ public OptionalColumnCreatorPipelineStep(Output output) } } + namespace Transforms + { + + public sealed partial class PcaTransformColumn : OneToOneColumn, IOneToOneColumn + { + /// + /// The name of the weight column + /// + public string WeightColumn { get; set; } + + /// + /// The number of components in the PCA + /// + public int? Rank { get; set; } + + /// + /// Oversampling parameter for randomized PCA training + /// + public int? Oversampling { get; set; } + + /// + /// If enabled, data is centered to be zero mean + /// + public bool? Center { get; set; } + + /// + /// The seed for random number generation + /// + public int? Seed { get; set; } + + /// + /// Name of the new column + /// + public string Name { get; set; } + + /// + /// Name of the source column + /// + public string Source { get; set; } + + } + + /// + /// Train an PCA Anomaly model. + /// + public sealed partial class PcaCalculator : Microsoft.ML.Runtime.EntryPoints.CommonInputs.ITransformInput, Microsoft.ML.ILearningPipelineItem + { + + public PcaCalculator() + { + } + + public PcaCalculator(params string[] inputColumns) + { + if (inputColumns != null) + { + foreach (string input in inputColumns) + { + AddColumn(input); + } + } + } + + public PcaCalculator(params ValueTuple[] inputOutputColumns) + { + if (inputOutputColumns != null) + { + foreach (ValueTuple inputOutput in inputOutputColumns) + { + AddColumn(inputOutput.Item2, inputOutput.Item1); + } + } + } + + public void AddColumn(string source) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(source)); + Column = list.ToArray(); + } + + public void AddColumn(string name, string source) + { + var list = Column == null ? new List() : new List(Column); + list.Add(OneToOneColumn.Create(name, source)); + Column = list.ToArray(); + } + + + /// + /// New column definition(s) (optional form: name:src) + /// + public Transforms.PcaTransformColumn[] Column { get; set; } + + /// + /// The name of the weight column + /// + public string WeightColumn { get; set; } + + /// + /// The number of components in the PCA + /// + public int Rank { get; set; } = 20; + + /// + /// Oversampling parameter for randomized PCA training + /// + public int Oversampling { get; set; } = 20; + + /// + /// If enabled, data is centered to be zero mean + /// + public bool Center { get; set; } = true; + + /// + /// The seed for random number generation + /// + public int Seed { get; set; } + + /// + /// Input dataset + /// + public Var Data { get; set; } = new Var(); + + + public sealed class Output : Microsoft.ML.Runtime.EntryPoints.CommonOutputs.ITransformOutput + { + /// + /// Transformed dataset + /// + public Var OutputData { get; set; } = new Var(); + + /// + /// Transform model + /// + public Var Model { get; set; } = new Var(); + + } + public ILearningPipelineStep ApplyStep(ILearningPipelineStep previousStep, Experiment experiment) + { + if (!(previousStep is ILearningPipelineDataStep dataStep)) + { + throw new InvalidOperationException($"{ nameof(PcaCalculator)} only supports an { nameof(ILearningPipelineDataStep)} as an input."); + } + + Data = dataStep.Data; + Output output = experiment.Add(this); + return new PcaCalculatorPipelineStep(output); + } + + private class PcaCalculatorPipelineStep : ILearningPipelineDataStep + { + public PcaCalculatorPipelineStep(Output output) + { + Data = output.OutputData; + Model = output.Model; + } + + public Var Data { get; } + public Var Model { get; } + } + } + } + namespace Transforms { diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index 24e8374b4c..aa1a04d7e9 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -1083,7 +1083,7 @@ public void EntryPointLogisticRegressionMultiClass() [Fact] public void EntryPointPcaAnomaly() { - TestEntryPointRoutine("MNIST.Train.0-class.tiny.txt", "Trainers.PcaAnomalyDetector"); + TestEntryPointRoutine("MNIST.Train.0-class.tiny.txt", "Trainers.PcaAnomalyDetector", "col=Features:R4:1-784"); } [Fact] diff --git a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj index 59c6d8f6c6..707f0512c4 100644 --- a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj +++ b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj @@ -1,5 +1,6 @@  + From 010b709ed18ef7effe7c5143c27b1a180896158e Mon Sep 17 00:00:00 2001 From: Yael Dekel Date: Wed, 23 May 2018 14:35:48 -0700 Subject: [PATCH 2/2] Fix EntryPointCatalog test. --- ZBaselines/Common/EntryPoints/core_manifest.json | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/ZBaselines/Common/EntryPoints/core_manifest.json b/ZBaselines/Common/EntryPoints/core_manifest.json index 6eeb1bf709..d705b26010 100644 --- a/ZBaselines/Common/EntryPoints/core_manifest.json +++ b/ZBaselines/Common/EntryPoints/core_manifest.json @@ -10585,18 +10585,6 @@ "IsNullable": false, "Default": "Features" }, - { - "Name": "LabelColumn", - "Type": "String", - "Desc": "Column to use for labels", - "Aliases": [ - "lab" - ], - "Required": false, - "SortOrder": 3.0, - "IsNullable": false, - "Default": "Label" - }, { "Name": "WeightColumn", "Type": "String", @@ -10727,8 +10715,6 @@ } ], "InputKind": [ - "ITrainerInputWithWeight", - "ITrainerInputWithLabel", "ITrainerInput" ], "OutputKind": [