diff --git a/src/Microsoft.ML.Data/Transforms/BootstrapSamplingTransformer.cs b/src/Microsoft.ML.Data/Transforms/BootstrapSamplingTransformer.cs index d44f855f1b..cb61c00db1 100644 --- a/src/Microsoft.ML.Data/Transforms/BootstrapSamplingTransformer.cs +++ b/src/Microsoft.ML.Data/Transforms/BootstrapSamplingTransformer.cs @@ -13,7 +13,7 @@ using Microsoft.ML.Model; using Microsoft.ML.Transforms; -[assembly: LoadableClass(BootstrapSamplingTransformer.Summary, typeof(BootstrapSamplingTransformer), typeof(BootstrapSamplingTransformer.Arguments), typeof(SignatureDataTransform), +[assembly: LoadableClass(BootstrapSamplingTransformer.Summary, typeof(BootstrapSamplingTransformer), typeof(BootstrapSamplingTransformer.Options), typeof(SignatureDataTransform), BootstrapSamplingTransformer.UserName, "BootstrapSampleTransform", "BootstrapSample")] [assembly: LoadableClass(BootstrapSamplingTransformer.Summary, typeof(BootstrapSamplingTransformer), null, typeof(SignatureLoadDataTransform), @@ -36,7 +36,7 @@ internal static class Defaults public const int PoolSize = 1000; } - public sealed class Arguments : TransformInputBase + public sealed class Options : TransformInputBase { [Argument(ArgumentType.AtMostOnce, HelpText = "Whether this is the out-of-bag sample, that is, all those rows that are not selected by the transform.", ShortName = "comp")] @@ -76,16 +76,16 @@ private static VersionInfo GetVersionInfo() private readonly bool _shuffleInput; private readonly int _poolSize; - public BootstrapSamplingTransformer(IHostEnvironment env, Arguments args, IDataView input) + public BootstrapSamplingTransformer(IHostEnvironment env, Options options, IDataView input) : base(env, RegistrationName, input) { - Host.CheckValue(args, nameof(args)); - Host.CheckUserArg(args.PoolSize >= 0, nameof(args.PoolSize), "Cannot be negative"); + Host.CheckValue(options, nameof(options)); + Host.CheckUserArg(options.PoolSize >= 0, nameof(options.PoolSize), "Cannot be negative"); - _complement = 
args.Complement; - _state = new TauswortheHybrid.State(args.Seed ?? (uint)Host.Rand.Next()); - _shuffleInput = args.ShuffleInput; - _poolSize = args.PoolSize; + _complement = options.Complement; + _state = new TauswortheHybrid.State(options.Seed ?? (uint)Host.Rand.Next()); + _shuffleInput = options.ShuffleInput; + _poolSize = options.PoolSize; } /// @@ -103,7 +103,7 @@ public BootstrapSamplingTransformer(IHostEnvironment env, uint? seed = null, bool shuffleInput = Defaults.ShuffleInput, int poolSize = Defaults.PoolSize) - : this(env, new Arguments() { Complement = complement, Seed = seed, ShuffleInput = shuffleInput, PoolSize = poolSize }, input) + : this(env, new Options() { Complement = complement, Seed = seed, ShuffleInput = shuffleInput, PoolSize = poolSize }, input) { } @@ -242,7 +242,7 @@ protected override bool MoveNextCore() internal static class BootstrapSample { [TlcModule.EntryPoint(Name = "Transforms.ApproximateBootstrapSampler", Desc = BootstrapSamplingTransformer.Summary, UserName = BootstrapSamplingTransformer.UserName, ShortName = BootstrapSamplingTransformer.RegistrationName)] - public static CommonOutputs.TransformOutput GetSample(IHostEnvironment env, BootstrapSamplingTransformer.Arguments input) + public static CommonOutputs.TransformOutput GetSample(IHostEnvironment env, BootstrapSamplingTransformer.Options input) { Contracts.CheckValue(env, nameof(env)); env.CheckValue(input, nameof(input)); diff --git a/src/Microsoft.ML.Data/Transforms/RowShufflingTransformer.cs b/src/Microsoft.ML.Data/Transforms/RowShufflingTransformer.cs index 7f08930430..359067c01b 100644 --- a/src/Microsoft.ML.Data/Transforms/RowShufflingTransformer.cs +++ b/src/Microsoft.ML.Data/Transforms/RowShufflingTransformer.cs @@ -15,7 +15,7 @@ using Microsoft.ML.Model; using Microsoft.ML.Transforms; -[assembly: LoadableClass(RowShufflingTransformer.Summary, typeof(RowShufflingTransformer), typeof(RowShufflingTransformer.Arguments), typeof(SignatureDataTransform), +[assembly: 
LoadableClass(RowShufflingTransformer.Summary, typeof(RowShufflingTransformer), typeof(RowShufflingTransformer.Options), typeof(SignatureDataTransform), "Shuffle Transform", "ShuffleTransform", "Shuffle", "shuf")] [assembly: LoadableClass(RowShufflingTransformer.Summary, typeof(RowShufflingTransformer), null, typeof(SignatureLoadDataTransform), @@ -30,7 +30,8 @@ namespace Microsoft.ML.Transforms /// rows in the input cursor, and then, successively, the output cursor will yield one /// of these rows and replace it with another row from the input. /// - public sealed class RowShufflingTransformer : RowToRowTransformBase + [BestFriend] + internal sealed class RowShufflingTransformer : RowToRowTransformBase { private static class Defaults { @@ -39,7 +40,7 @@ private static class Defaults public const bool ForceShuffle = false; } - public sealed class Arguments + public sealed class Options { // REVIEW: A more intelligent heuristic, based on the expected size of the inputs, perhaps? [Argument(ArgumentType.LastOccurenceWins, HelpText = "The pool will have this many rows", ShortName = "rows")] @@ -99,14 +100,14 @@ public RowShufflingTransformer(IHostEnvironment env, int poolRows = Defaults.PoolRows, bool poolOnly = Defaults.PoolOnly, bool forceShuffle = Defaults.ForceShuffle) - : this(env, new Arguments() { PoolRows = poolRows, PoolOnly = poolOnly, ForceShuffle = forceShuffle }, input) + : this(env, new Options() { PoolRows = poolRows, PoolOnly = poolOnly, ForceShuffle = forceShuffle }, input) { } /// /// Public constructor corresponding to SignatureDataTransform. 
/// - public RowShufflingTransformer(IHostEnvironment env, Arguments args, IDataView input) + public RowShufflingTransformer(IHostEnvironment env, Options args, IDataView input) : base(env, RegistrationName, input) { Host.CheckValue(args, nameof(args)); diff --git a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs index 149fb91159..c3a98e47c8 100644 --- a/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs +++ b/src/Microsoft.ML.Ensemble/Selector/SubsetSelector/BootstrapSelector.cs @@ -47,7 +47,7 @@ public override IEnumerable GetSubsets(Batch batch, Random rand) for (int i = 0; i < Size; i++) { // REVIEW: Consider ways to reintroduce "balanced" samples. - var viewTrain = new BootstrapSamplingTransformer(Host, new BootstrapSamplingTransformer.Arguments(), Data.Data); + var viewTrain = new BootstrapSamplingTransformer(Host, new BootstrapSamplingTransformer.Options(), Data.Data); var dataTrain = new RoleMappedData(viewTrain, Data.Schema.GetColumnRoleNames()); yield return FeatureSelector.SelectFeatures(dataTrain, rand); } diff --git a/src/Microsoft.ML.HalLearners.StaticPipe/VectorWhiteningStaticExtensions.cs b/src/Microsoft.ML.HalLearners.StaticPipe/VectorWhiteningStaticExtensions.cs index fad38dc3a5..b6f2b419a4 100644 --- a/src/Microsoft.ML.HalLearners.StaticPipe/VectorWhiteningStaticExtensions.cs +++ b/src/Microsoft.ML.HalLearners.StaticPipe/VectorWhiteningStaticExtensions.cs @@ -49,9 +49,9 @@ public override IEstimator Reconcile(IHostEnvironment env, { Contracts.Assert(toOutput.Length == 1); - var infos = new VectorWhiteningTransformer.ColumnInfo[toOutput.Length]; + var infos = new VectorWhiteningEstimator.ColumnInfo[toOutput.Length]; for (int i = 0; i < toOutput.Length; i++) - infos[i] = new VectorWhiteningTransformer.ColumnInfo(outputNames[toOutput[i]], inputNames[((OutPipelineColumn)toOutput[i]).Input], _kind, _eps, _maxRows, _pcaNum); + infos[i] = 
new VectorWhiteningEstimator.ColumnInfo(outputNames[toOutput[i]], inputNames[((OutPipelineColumn)toOutput[i]).Input], _kind, _eps, _maxRows, _pcaNum); return new VectorWhiteningEstimator(env, infos); } @@ -63,9 +63,9 @@ public override IEstimator Reconcile(IHostEnvironment env, /// Maximum number of rows used to train the transform. /// In case of PCA whitening, indicates the number of components to retain. public static Vector PcaWhitening(this Vector input, - float eps = VectorWhiteningTransformer.Defaults.Eps, - int maxRows = VectorWhiteningTransformer.Defaults.MaxRows, - int pcaNum = VectorWhiteningTransformer.Defaults.PcaNum) + float eps = VectorWhiteningEstimator.Defaults.Eps, + int maxRows = VectorWhiteningEstimator.Defaults.MaxRows, + int pcaNum = VectorWhiteningEstimator.Defaults.PcaNum) => new OutPipelineColumn(input, WhiteningKind.Pca, eps, maxRows, pcaNum); /// @@ -73,8 +73,8 @@ public static Vector PcaWhitening(this Vector input, /// Whitening constant, prevents division by zero. /// Maximum number of rows used to train the transform. 
public static Vector ZcaWhitening(this Vector input, - float eps = VectorWhiteningTransformer.Defaults.Eps, - int maxRows = VectorWhiteningTransformer.Defaults.MaxRows) - => new OutPipelineColumn(input, WhiteningKind.Zca, eps, maxRows, VectorWhiteningTransformer.Defaults.PcaNum); + float eps = VectorWhiteningEstimator.Defaults.Eps, + int maxRows = VectorWhiteningEstimator.Defaults.MaxRows) + => new OutPipelineColumn(input, WhiteningKind.Zca, eps, maxRows, VectorWhiteningEstimator.Defaults.PcaNum); } } diff --git a/src/Microsoft.ML.HalLearners/HalLearnersCatalog.cs b/src/Microsoft.ML.HalLearners/HalLearnersCatalog.cs index 9f6f16ab51..1b635d49fc 100644 --- a/src/Microsoft.ML.HalLearners/HalLearnersCatalog.cs +++ b/src/Microsoft.ML.HalLearners/HalLearnersCatalog.cs @@ -112,10 +112,10 @@ public static SymSgdClassificationTrainer SymbolicStochasticGradientDescent( /// /// public static VectorWhiteningEstimator VectorWhiten(this TransformsCatalog.ProjectionTransforms catalog, string outputColumnName, string inputColumnName = null, - WhiteningKind kind = VectorWhiteningTransformer.Defaults.Kind, - float eps = VectorWhiteningTransformer.Defaults.Eps, - int maxRows = VectorWhiteningTransformer.Defaults.MaxRows, - int pcaNum = VectorWhiteningTransformer.Defaults.PcaNum) + WhiteningKind kind = VectorWhiteningEstimator.Defaults.Kind, + float eps = VectorWhiteningEstimator.Defaults.Eps, + int maxRows = VectorWhiteningEstimator.Defaults.MaxRows, + int pcaNum = VectorWhiteningEstimator.Defaults.PcaNum) => new VectorWhiteningEstimator(CatalogUtils.GetEnvironment(catalog), outputColumnName, inputColumnName, kind, eps, maxRows, pcaNum); /// @@ -124,7 +124,7 @@ public static VectorWhiteningEstimator VectorWhiten(this TransformsCatalog.Proje /// /// The transform's catalog. /// Describes the parameters of the whitening process for each column pair. 
- public static VectorWhiteningEstimator VectorWhiten(this TransformsCatalog.ProjectionTransforms catalog, params VectorWhiteningTransformer.ColumnInfo[] columns) + public static VectorWhiteningEstimator VectorWhiten(this TransformsCatalog.ProjectionTransforms catalog, params VectorWhiteningEstimator.ColumnInfo[] columns) => new VectorWhiteningEstimator(CatalogUtils.GetEnvironment(catalog), columns); } diff --git a/src/Microsoft.ML.HalLearners/SymSgdClassificationTrainer.cs b/src/Microsoft.ML.HalLearners/SymSgdClassificationTrainer.cs index af5ecd4a11..35a6b55d4f 100644 --- a/src/Microsoft.ML.HalLearners/SymSgdClassificationTrainer.cs +++ b/src/Microsoft.ML.HalLearners/SymSgdClassificationTrainer.cs @@ -141,7 +141,7 @@ private RoleMappedData PrepareDataFromTrainingExamples(IChannel ch, RoleMappedDa idvToFeedTrain = idvToShuffle; else { - var shuffleArgs = new RowShufflingTransformer.Arguments + var shuffleArgs = new RowShufflingTransformer.Options { PoolOnly = false, ForceShuffle = _options.Shuffle diff --git a/src/Microsoft.ML.HalLearners/VectorWhitening.cs b/src/Microsoft.ML.HalLearners/VectorWhitening.cs index 31cdd0b201..f865ba4fcd 100644 --- a/src/Microsoft.ML.HalLearners/VectorWhitening.cs +++ b/src/Microsoft.ML.HalLearners/VectorWhitening.cs @@ -19,7 +19,7 @@ using Microsoft.ML.Model; using Microsoft.ML.Transforms.Projections; -[assembly: LoadableClass(VectorWhiteningTransformer.Summary, typeof(IDataTransform), typeof(VectorWhiteningTransformer), typeof(VectorWhiteningTransformer.Arguments), typeof(SignatureDataTransform), +[assembly: LoadableClass(VectorWhiteningTransformer.Summary, typeof(IDataTransform), typeof(VectorWhiteningTransformer), typeof(VectorWhiteningTransformer.Options), typeof(SignatureDataTransform), VectorWhiteningTransformer.FriendlyName, VectorWhiteningTransformer.LoaderSignature, "Whitening")] [assembly: LoadableClass(VectorWhiteningTransformer.Summary, typeof(IDataTransform), typeof(VectorWhiteningTransformer), null, 
typeof(SignatureLoadDataTransform), @@ -33,11 +33,18 @@ namespace Microsoft.ML.Transforms.Projections { + /// + /// Which vector whitening technique to use. ZCA whitening ensures that the average covariance between whitened + /// and original variables is maximal. In contrast, PCA whitening leads to maximally compressed whitened variables, as + /// measured by squared covariance. + /// public enum WhiteningKind { + /// PCA whitening. [TGUI(Label = "PCA whitening")] Pca, + /// ZCA whitening. [TGUI(Label = "ZCA whitening")] Zca } @@ -45,42 +52,32 @@ public enum WhiteningKind /// public sealed class VectorWhiteningTransformer : OneToOneTransformerBase { - [BestFriend] - internal static class Defaults - { - public const WhiteningKind Kind = WhiteningKind.Zca; - public const float Eps = 1e-5f; - public const int MaxRows = 100 * 1000; - public const bool SaveInverse = false; - public const int PcaNum = 0; - } - - public sealed class Arguments + internal sealed class Options { [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)", Name = "Column", ShortName = "col", SortOrder = 1)] public Column[] Columns; [Argument(ArgumentType.AtMostOnce, HelpText = "Whitening kind (PCA/ZCA)")] - public WhiteningKind Kind = Defaults.Kind; + public WhiteningKind Kind = VectorWhiteningEstimator.Defaults.Kind; [Argument(ArgumentType.AtMostOnce, HelpText = "Scaling regularizer")] - public float Eps = Defaults.Eps; + public float Eps = VectorWhiteningEstimator.Defaults.Eps; [Argument(ArgumentType.AtMostOnce, HelpText = "Max number of rows", ShortName = "rows")] - public int MaxRows = Defaults.MaxRows; + public int MaxRows = VectorWhiteningEstimator.Defaults.MaxRows; [Argument(ArgumentType.AtMostOnce, HelpText = "Whether to save inverse (recovery) matrix", ShortName = "saveInv")] - public bool SaveInverse = Defaults.SaveInverse; + public bool SaveInverse = VectorWhiteningEstimator.Defaults.SaveInverse;
[Argument(ArgumentType.AtMostOnce, HelpText = "PCA components to retain")] - public int PcaNum = Defaults.PcaNum; + public int PcaNum = VectorWhiteningEstimator.Defaults.PcaNum; // REVIEW: add the following options: // 1. Currently there is no way to apply an inverse transform AFTER the the transform is trained. // 2. How many PCA components to retain/drop. Options: retain-first, drop-first, variance-threshold. } - public sealed class Column : OneToOneColumn + internal sealed class Column : OneToOneColumn { [Argument(ArgumentType.AtMostOnce, HelpText = "Whitening kind (PCA/ZCA)")] public WhiteningKind? Kind; @@ -116,103 +113,6 @@ internal bool TryUnparse(StringBuilder sb) } } - public sealed class ColumnInfo - { - public readonly string Name; - public readonly string InputColumnName; - public readonly WhiteningKind Kind; - public readonly float Epsilon; - public readonly int MaxRow; - public readonly int PcaNum; - internal readonly bool SaveInv; - - /// - /// Describes how the transformer handles one input-output column pair. - /// - /// Name of the column resulting from the transformation of . - /// Name of column to transform. If set to , the value of the will be used as source. - /// Whitening kind (PCA/ZCA). - /// Whitening constant, prevents division by zero. - /// Maximum number of rows used to train the transform. - /// In case of PCA whitening, indicates the number of components to retain. - public ColumnInfo(string name, string inputColumnName = null, WhiteningKind kind = Defaults.Kind, float eps = Defaults.Eps, - int maxRows = Defaults.MaxRows, int pcaNum = Defaults.PcaNum) - { - Name = name; - Contracts.CheckValue(Name, nameof(Name)); - InputColumnName = inputColumnName ?? 
name; - Contracts.CheckValue(InputColumnName, nameof(InputColumnName)); - Kind = kind; - Contracts.CheckUserArg(Kind == WhiteningKind.Pca || Kind == WhiteningKind.Zca, nameof(Kind)); - Epsilon = eps; - Contracts.CheckUserArg(0 <= Epsilon && Epsilon < float.PositiveInfinity, nameof(Epsilon)); - MaxRow = maxRows; - Contracts.CheckUserArg(MaxRow > 0, nameof(MaxRow)); - SaveInv = Defaults.SaveInverse; - PcaNum = pcaNum; // REVIEW: make it work with pcaNum == 1. - Contracts.CheckUserArg(PcaNum >= 0, nameof(PcaNum)); - } - - internal ColumnInfo(Column item, Arguments args) - { - Name = item.Name; - Contracts.CheckValue(Name, nameof(Name)); - InputColumnName = item.Source ?? item.Name; - Contracts.CheckValue(InputColumnName, nameof(InputColumnName)); - Kind = item.Kind ?? args.Kind; - Contracts.CheckUserArg(Kind == WhiteningKind.Pca || Kind == WhiteningKind.Zca, nameof(item.Kind)); - Epsilon = item.Eps ?? args.Eps; - Contracts.CheckUserArg(0 <= Epsilon && Epsilon < float.PositiveInfinity, nameof(item.Eps)); - MaxRow = item.MaxRows ?? args.MaxRows; - Contracts.CheckUserArg(MaxRow > 0, nameof(item.MaxRows)); - SaveInv = item.SaveInverse ?? args.SaveInverse; - PcaNum = item.PcaNum ?? 
args.PcaNum; - Contracts.CheckUserArg(PcaNum >= 0, nameof(item.PcaNum)); - } - - internal ColumnInfo(ModelLoadContext ctx) - { - Contracts.AssertValue(ctx); - - // *** Binary format *** - // int: kind - // float: epsilon - // int: maxrow - // byte: saveInv - // int: pcaNum - Kind = (WhiteningKind)ctx.Reader.ReadInt32(); - Contracts.CheckDecode(Kind == WhiteningKind.Pca || Kind == WhiteningKind.Zca); - Epsilon = ctx.Reader.ReadFloat(); - Contracts.CheckDecode(0 <= Epsilon && Epsilon < float.PositiveInfinity); - MaxRow = ctx.Reader.ReadInt32(); - Contracts.CheckDecode(MaxRow > 0); - SaveInv = ctx.Reader.ReadBoolByte(); - PcaNum = ctx.Reader.ReadInt32(); - Contracts.CheckDecode(PcaNum >= 0); - } - - internal void Save(ModelSaveContext ctx) - { - Contracts.AssertValue(ctx); - - // *** Binary format *** - // int: kind - // float: epsilon - // int: maxrow - // byte: saveInv - // int: pcaNum - Contracts.Assert(Kind == WhiteningKind.Pca || Kind == WhiteningKind.Zca); - ctx.Writer.Write((int)Kind); - Contracts.Assert(0 <= Epsilon && Epsilon < float.PositiveInfinity); - ctx.Writer.Write(Epsilon); - Contracts.Assert(MaxRow > 0); - ctx.Writer.Write(MaxRow); - ctx.Writer.WriteBoolByte(SaveInv); - Contracts.Assert(PcaNum >= 0); - ctx.Writer.Write(PcaNum); - } - } - private const Mkl.Layout Layout = Mkl.Layout.RowMajor; // Stores whitening matrix as float[] for each column. _models[i] is the whitening matrix of the i-th input column. @@ -238,7 +138,7 @@ private static VersionInfo GetVersionInfo() loaderAssemblyName: typeof(VectorWhiteningTransformer).Assembly.FullName); } - private readonly ColumnInfo[] _columns; + private readonly VectorWhiteningEstimator.ColumnInfo[] _columns; /// /// Initializes a new object. @@ -247,7 +147,7 @@ private static VersionInfo GetVersionInfo() /// An array of whitening matrices where models[i] is learned from the i-th element of . /// An array of inverse whitening matrices, the i-th element being the inverse matrix of models[i]. 
/// Describes the parameters of the whitening process for each column pair. - internal VectorWhiteningTransformer(IHostEnvironment env, float[][] models, float[][] invModels, params ColumnInfo[] columns) + internal VectorWhiteningTransformer(IHostEnvironment env, float[][] models, float[][] invModels, params VectorWhiteningEstimator.ColumnInfo[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(VectorWhiteningTransformer)), GetColumnPairs(columns)) { Host.AssertNonEmpty(ColumnPairs); @@ -270,9 +170,9 @@ private VectorWhiteningTransformer(IHostEnvironment env, ModelLoadContext ctx) // recovery matrix Host.AssertNonEmpty(ColumnPairs); - _columns = new ColumnInfo[ColumnPairs.Length]; + _columns = new VectorWhiteningEstimator.ColumnInfo[ColumnPairs.Length]; for (int i = 0; i < _columns.Length; i++) - _columns[i] = new ColumnInfo(ctx); + _columns[i] = new VectorWhiteningEstimator.ColumnInfo(ctx); _models = new float[ColumnPairs.Length][]; _invModels = new float[ColumnPairs.Length][]; @@ -293,9 +193,9 @@ internal static VectorWhiteningTransformer Create(IHostEnvironment env, ModelLoa } // Factory method for SignatureDataTransform. 
- internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { - var infos = args.Columns.Select(colPair => new ColumnInfo(colPair, args)).ToArray(); + var infos = options.Columns.Select(colPair => new VectorWhiteningEstimator.ColumnInfo(colPair, options)).ToArray(); (var models, var invModels) = TrainVectorWhiteningTransform(env, input, infos); return new VectorWhiteningTransformer(env, models, invModels, infos).MakeDataTransform(input); } @@ -308,7 +208,7 @@ internal static IDataTransform Create(IHostEnvironment env, ModelLoadContext ctx internal static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, Schema inputSchema) => Create(env, ctx).MakeRowMapper(inputSchema); - private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(ColumnInfo[] columns) + private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(VectorWhiteningEstimator.ColumnInfo[] columns) => columns.Select(c => (c.Name, c.InputColumnName ?? c.Name)).ToArray(); protected override void CheckInputColumn(Schema inputSchema, int col, int srcCol) @@ -345,7 +245,7 @@ private static void ValidateModel(IExceptionContext ectx, float[] model, ColumnT // Sometime GetRowCount doesn't really return the number of rows in the associated IDataView. // A more reliable solution is to turely iterate through all rows via a RowCursor. - private static long GetRowCount(IDataView inputData, params ColumnInfo[] columns) + private static long GetRowCount(IDataView inputData, params VectorWhiteningEstimator.ColumnInfo[] columns) { long? rows = inputData.GetRowCount(); if (rows != null) @@ -362,7 +262,7 @@ private static long GetRowCount(IDataView inputData, params ColumnInfo[] columns } // Computes the transformation matrices needed for whitening process from training data. 
- internal static (float[][] models, float[][] invModels) TrainVectorWhiteningTransform(IHostEnvironment env, IDataView inputData, params ColumnInfo[] columns) + internal static (float[][] models, float[][] invModels) TrainVectorWhiteningTransform(IHostEnvironment env, IDataView inputData, params VectorWhiteningEstimator.ColumnInfo[] columns) { var models = new float[columns.Length][]; var invModels = new float[columns.Length][]; @@ -378,7 +278,7 @@ internal static (float[][] models, float[][] invModels) TrainVectorWhiteningTran } // Extracts the indices and types of the input columns to the whitening transform. - private static void GetColTypesAndIndex(IHostEnvironment env, IDataView inputData, ColumnInfo[] columns, out ColumnType[] srcTypes, out int[] cols) + private static void GetColTypesAndIndex(IHostEnvironment env, IDataView inputData, VectorWhiteningEstimator.ColumnInfo[] columns, out ColumnType[] srcTypes, out int[] cols) { cols = new int[columns.Length]; srcTypes = new ColumnType[columns.Length]; @@ -400,7 +300,7 @@ private static void GetColTypesAndIndex(IHostEnvironment env, IDataView inputDat // Loads all relevant data for whitening training into memory. private static float[][] LoadDataAsDense(IHostEnvironment env, IChannel ch, IDataView inputData, out int[] actualRowCounts, - ColumnType[] srcTypes, int[] cols, params ColumnInfo[] columns) + ColumnType[] srcTypes, int[] cols, params VectorWhiteningEstimator.ColumnInfo[] columns) { long crowData = GetRowCount(inputData, columns); @@ -467,7 +367,7 @@ private static float[][] LoadDataAsDense(IHostEnvironment env, IChannel ch, IDat // will have dimension input_vec_size x input_vec_size. In the getter, the matrix will be truncated to only keep // PcaNum columns, and thus produce the desired output size. 
private static void TrainModels(IHostEnvironment env, IChannel ch, float[][] columnData, int[] rowCounts, - ref float[][] models, ref float[][] invModels, ColumnType[] srcTypes, params ColumnInfo[] columns) + ref float[][] models, ref float[][] invModels, ColumnType[] srcTypes, params VectorWhiteningEstimator.ColumnInfo[] columns) { ch.Assert(columnData.Length == rowCounts.Length); @@ -772,15 +672,143 @@ private static float DotProduct(float[] a, int aOffset, ReadOnlySpan b, R /// public sealed class VectorWhiteningEstimator : IEstimator { + [BestFriend] + internal static class Defaults + { + public const WhiteningKind Kind = WhiteningKind.Zca; + public const float Eps = 1e-5f; + public const int MaxRows = 100 * 1000; + public const bool SaveInverse = false; + public const int PcaNum = 0; + } + + /// + /// Describes how the transformer handles one column pair. + /// + public sealed class ColumnInfo + { + /// + /// Name of the column resulting from the transformation of . + /// + public readonly string Name; + /// + /// Name of column to transform. + /// + public readonly string InputColumnName; + /// + /// Whitening kind (PCA/ZCA). + /// + public readonly WhiteningKind Kind; + /// + /// Whitening constant, prevents division by zero. + /// + public readonly float Epsilon; + /// + /// Maximum number of rows used to train the transform. + /// + public readonly int MaxRow; + /// + /// In case of PCA whitening, indicates the number of components to retain. + /// + public readonly int PcaNum; + internal readonly bool SaveInv; + + /// + /// Describes how the transformer handles one input-output column pair. + /// + /// Name of the column resulting from the transformation of . + /// Name of column to transform. If set to , the value of the will be used as source. + /// Whitening kind (PCA/ZCA). + /// Whitening constant, prevents division by zero. + /// Maximum number of rows used to train the transform. 
+ /// In case of PCA whitening, indicates the number of components to retain. + public ColumnInfo(string name, string inputColumnName = null, WhiteningKind kind = Defaults.Kind, float eps = Defaults.Eps, + int maxRows = Defaults.MaxRows, int pcaNum = Defaults.PcaNum) + { + Name = name; + Contracts.CheckValue(Name, nameof(Name)); + InputColumnName = inputColumnName ?? name; + Contracts.CheckValue(InputColumnName, nameof(InputColumnName)); + Kind = kind; + Contracts.CheckUserArg(Kind == WhiteningKind.Pca || Kind == WhiteningKind.Zca, nameof(Kind)); + Epsilon = eps; + Contracts.CheckUserArg(0 <= Epsilon && Epsilon < float.PositiveInfinity, nameof(Epsilon)); + MaxRow = maxRows; + Contracts.CheckUserArg(MaxRow > 0, nameof(MaxRow)); + SaveInv = Defaults.SaveInverse; + PcaNum = pcaNum; // REVIEW: make it work with pcaNum == 1. + Contracts.CheckUserArg(PcaNum >= 0, nameof(PcaNum)); + } + + internal ColumnInfo(VectorWhiteningTransformer.Column item, VectorWhiteningTransformer.Options options) + { + Name = item.Name; + Contracts.CheckValue(Name, nameof(Name)); + InputColumnName = item.Source ?? item.Name; + Contracts.CheckValue(InputColumnName, nameof(InputColumnName)); + Kind = item.Kind ?? options.Kind; + Contracts.CheckUserArg(Kind == WhiteningKind.Pca || Kind == WhiteningKind.Zca, nameof(item.Kind)); + Epsilon = item.Eps ?? options.Eps; + Contracts.CheckUserArg(0 <= Epsilon && Epsilon < float.PositiveInfinity, nameof(item.Eps)); + MaxRow = item.MaxRows ?? options.MaxRows; + Contracts.CheckUserArg(MaxRow > 0, nameof(item.MaxRows)); + SaveInv = item.SaveInverse ?? options.SaveInverse; + PcaNum = item.PcaNum ?? 
options.PcaNum; + Contracts.CheckUserArg(PcaNum >= 0, nameof(item.PcaNum)); + } + + internal ColumnInfo(ModelLoadContext ctx) + { + Contracts.AssertValue(ctx); + + // *** Binary format *** + // int: kind + // float: epsilon + // int: maxrow + // byte: saveInv + // int: pcaNum + Kind = (WhiteningKind)ctx.Reader.ReadInt32(); + Contracts.CheckDecode(Kind == WhiteningKind.Pca || Kind == WhiteningKind.Zca); + Epsilon = ctx.Reader.ReadFloat(); + Contracts.CheckDecode(0 <= Epsilon && Epsilon < float.PositiveInfinity); + MaxRow = ctx.Reader.ReadInt32(); + Contracts.CheckDecode(MaxRow > 0); + SaveInv = ctx.Reader.ReadBoolByte(); + PcaNum = ctx.Reader.ReadInt32(); + Contracts.CheckDecode(PcaNum >= 0); + } + + internal void Save(ModelSaveContext ctx) + { + Contracts.AssertValue(ctx); + + // *** Binary format *** + // int: kind + // float: epsilon + // int: maxrow + // byte: saveInv + // int: pcaNum + Contracts.Assert(Kind == WhiteningKind.Pca || Kind == WhiteningKind.Zca); + ctx.Writer.Write((int)Kind); + Contracts.Assert(0 <= Epsilon && Epsilon < float.PositiveInfinity); + ctx.Writer.Write(Epsilon); + Contracts.Assert(MaxRow > 0); + ctx.Writer.Write(MaxRow); + ctx.Writer.WriteBoolByte(SaveInv); + Contracts.Assert(PcaNum >= 0); + ctx.Writer.Write(PcaNum); + } + } + private readonly IHost _host; - private readonly VectorWhiteningTransformer.ColumnInfo[] _infos; + private readonly ColumnInfo[] _infos; /// /// The environment. /// Describes the parameters of the whitening process for each column pair. 
- public VectorWhiteningEstimator(IHostEnvironment env, params VectorWhiteningTransformer.ColumnInfo[] columns) + internal VectorWhiteningEstimator(IHostEnvironment env, params ColumnInfo[] columns) { - _host = Contracts.CheckRef(env, nameof(env)).Register(nameof(VectorWhiteningTransformer)); + _host = Contracts.CheckRef(env, nameof(env)).Register(nameof(VectorWhiteningEstimator)); _infos = columns; } @@ -792,15 +820,18 @@ public VectorWhiteningEstimator(IHostEnvironment env, params VectorWhiteningTran /// Whitening constant, prevents division by zero when scaling the data by inverse of eigenvalues. /// Maximum number of rows used to train the transform. /// In case of PCA whitening, indicates the number of components to retain. - public VectorWhiteningEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, - WhiteningKind kind = VectorWhiteningTransformer.Defaults.Kind, - float eps = VectorWhiteningTransformer.Defaults.Eps, - int maxRows = VectorWhiteningTransformer.Defaults.MaxRows, - int pcaNum = VectorWhiteningTransformer.Defaults.PcaNum) - : this(env, new VectorWhiteningTransformer.ColumnInfo(outputColumnName, inputColumnName, kind, eps, maxRows, pcaNum)) + internal VectorWhiteningEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, + WhiteningKind kind = Defaults.Kind, + float eps = Defaults.Eps, + int maxRows = Defaults.MaxRows, + int pcaNum = Defaults.PcaNum) + : this(env, new ColumnInfo(outputColumnName, inputColumnName, kind, eps, maxRows, pcaNum)) { } + /// + /// Trains and returns a . + /// public VectorWhiteningTransformer Fit(IDataView input) { // Build transformation matrices for whitening process, then construct a trained transform. @@ -809,7 +840,8 @@ public VectorWhiteningTransformer Fit(IDataView input) } /// - /// Returns the schema that would be produced by the transformation. + /// Returns the of the schema which will be produced by the transformer. 
+ /// Used for schema propagation and verification in a pipeline. /// public SchemaShape GetOutputSchema(SchemaShape inputSchema) { diff --git a/src/Microsoft.ML.PCA/PCACatalog.cs b/src/Microsoft.ML.PCA/PCACatalog.cs index fca74a2524..d60bf7d1d0 100644 --- a/src/Microsoft.ML.PCA/PCACatalog.cs +++ b/src/Microsoft.ML.PCA/PCACatalog.cs @@ -33,7 +33,7 @@ public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(t /// Initializes a new instance of . /// The transform's catalog. /// Input columns to apply PrincipalComponentAnalysis on. - public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(this TransformsCatalog.ProjectionTransforms catalog, params PcaTransformer.ColumnInfo[] columns) + public static PrincipalComponentAnalysisEstimator ProjectToPrincipalComponents(this TransformsCatalog.ProjectionTransforms catalog, params PrincipalComponentAnalysisEstimator.ColumnInfo[] columns) => new PrincipalComponentAnalysisEstimator(CatalogUtils.GetEnvironment(catalog), columns); } } diff --git a/src/Microsoft.ML.PCA/PcaTransformer.cs b/src/Microsoft.ML.PCA/PcaTransformer.cs index 15918c4031..b6b134a1e0 100644 --- a/src/Microsoft.ML.PCA/PcaTransformer.cs +++ b/src/Microsoft.ML.PCA/PcaTransformer.cs @@ -17,26 +17,26 @@ using Microsoft.ML.Numeric; using Microsoft.ML.Transforms.Projections; -[assembly: LoadableClass(PcaTransformer.Summary, typeof(IDataTransform), typeof(PcaTransformer), typeof(PcaTransformer.Arguments), typeof(SignatureDataTransform), - PcaTransformer.UserName, PcaTransformer.LoaderSignature, PcaTransformer.ShortName)] +[assembly: LoadableClass(PrincipalComponentAnalysisTransformer.Summary, typeof(IDataTransform), typeof(PrincipalComponentAnalysisTransformer), typeof(PrincipalComponentAnalysisTransformer.Options), typeof(SignatureDataTransform), + PrincipalComponentAnalysisTransformer.UserName, PrincipalComponentAnalysisTransformer.LoaderSignature, PrincipalComponentAnalysisTransformer.ShortName)] -[assembly: 
LoadableClass(PcaTransformer.Summary, typeof(IDataTransform), typeof(PcaTransformer), null, typeof(SignatureLoadDataTransform), - PcaTransformer.UserName, PcaTransformer.LoaderSignature)] +[assembly: LoadableClass(PrincipalComponentAnalysisTransformer.Summary, typeof(IDataTransform), typeof(PrincipalComponentAnalysisTransformer), null, typeof(SignatureLoadDataTransform), + PrincipalComponentAnalysisTransformer.UserName, PrincipalComponentAnalysisTransformer.LoaderSignature)] -[assembly: LoadableClass(PcaTransformer.Summary, typeof(PcaTransformer), null, typeof(SignatureLoadModel), - PcaTransformer.UserName, PcaTransformer.LoaderSignature)] +[assembly: LoadableClass(PrincipalComponentAnalysisTransformer.Summary, typeof(PrincipalComponentAnalysisTransformer), null, typeof(SignatureLoadModel), + PrincipalComponentAnalysisTransformer.UserName, PrincipalComponentAnalysisTransformer.LoaderSignature)] -[assembly: LoadableClass(typeof(IRowMapper), typeof(PcaTransformer), null, typeof(SignatureLoadRowMapper), - PcaTransformer.UserName, PcaTransformer.LoaderSignature)] +[assembly: LoadableClass(typeof(IRowMapper), typeof(PrincipalComponentAnalysisTransformer), null, typeof(SignatureLoadRowMapper), + PrincipalComponentAnalysisTransformer.UserName, PrincipalComponentAnalysisTransformer.LoaderSignature)] -[assembly: LoadableClass(typeof(void), typeof(PcaTransformer), null, typeof(SignatureEntryPointModule), PcaTransformer.LoaderSignature)] +[assembly: LoadableClass(typeof(void), typeof(PrincipalComponentAnalysisTransformer), null, typeof(SignatureEntryPointModule), PrincipalComponentAnalysisTransformer.LoaderSignature)] namespace Microsoft.ML.Transforms.Projections { /// - public sealed class PcaTransformer : OneToOneTransformerBase + public sealed class PrincipalComponentAnalysisTransformer : OneToOneTransformerBase { - public sealed class Arguments : TransformInputBase + internal sealed class Options : TransformInputBase { [Argument(ArgumentType.Multiple | 
ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)", Name = "Column", ShortName = "col", SortOrder = 1)] public Column[] Columns; @@ -57,7 +57,7 @@ public sealed class Arguments : TransformInputBase public int Seed = PrincipalComponentAnalysisEstimator.Defaults.Seed; } - public class Column : OneToOneColumn + internal class Column : OneToOneColumn { [Argument(ArgumentType.Multiple, HelpText = "The name of the weight column", ShortName = "weight")] public string WeightColumn; @@ -96,47 +96,6 @@ internal bool TryUnparse(StringBuilder sb) } } - public sealed class ColumnInfo - { - public readonly string Name; - public readonly string InputColumnName; - public readonly string WeightColumn; - public readonly int Rank; - public readonly int Oversampling; - public readonly bool Center; - public readonly int? Seed; - - /// - /// Describes how the transformer handles one column pair. - /// - /// Name of the column resulting from the transformation of . - /// Name of column to transform. - /// If set to , the value of the will be used as source. - /// The name of the weight column. - /// The number of components in the PCA. - /// Oversampling parameter for randomized PCA training. - /// If enabled, data is centered to be zero mean. - /// The seed for random number generation. - public ColumnInfo(string name, - string inputColumnName = null, - string weightColumn = PrincipalComponentAnalysisEstimator.Defaults.WeightColumn, - int rank = PrincipalComponentAnalysisEstimator.Defaults.Rank, - int overSampling = PrincipalComponentAnalysisEstimator.Defaults.Oversampling, - bool center = PrincipalComponentAnalysisEstimator.Defaults.Center, - int? seed = null) - { - Name = name; - InputColumnName = inputColumnName ?? 
name; - WeightColumn = weightColumn; - Rank = rank; - Oversampling = overSampling; - Center = center; - Seed = seed; - Contracts.CheckParam(Oversampling >= 0, nameof(Oversampling), "Oversampling must be non-negative."); - Contracts.CheckParam(Rank > 0, nameof(Rank), "Rank must be positive."); - } - } - private sealed class TransformInfo { public readonly int Dimension; @@ -224,7 +183,7 @@ public void ProjectMean(float[] mean) internal const string UserName = "Principal Component Analysis Transform"; internal const string ShortName = "Pca"; - public const string LoaderSignature = "PcaTransform"; + internal const string LoaderSignature = "PcaTransform"; private static VersionInfo GetVersionInfo() { return new VersionInfo( @@ -234,7 +193,7 @@ private static VersionInfo GetVersionInfo() verReadableCur: 0x00010002, verWeCanReadBack: 0x00010001, loaderSignature: LoaderSignature, - loaderAssemblyName: typeof(PcaTransformer).Assembly.FullName); + loaderAssemblyName: typeof(PrincipalComponentAnalysisTransformer).Assembly.FullName); } private readonly int _numColumns; @@ -243,8 +202,8 @@ private static VersionInfo GetVersionInfo() private const string RegistrationName = "Pca"; - internal PcaTransformer(IHostEnvironment env, IDataView input, ColumnInfo[] columns) - : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(PcaTransformer)), GetColumnPairs(columns)) + internal PrincipalComponentAnalysisTransformer(IHostEnvironment env, IDataView input, PrincipalComponentAnalysisEstimator.ColumnInfo[] columns) + : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(PrincipalComponentAnalysisTransformer)), GetColumnPairs(columns)) { Host.AssertNonEmpty(ColumnPairs); _numColumns = columns.Length; @@ -262,7 +221,7 @@ internal PcaTransformer(IHostEnvironment env, IDataView input, ColumnInfo[] colu Train(columns, _transformInfos, input); } - private PcaTransformer(IHost host, ModelLoadContext ctx) + private PrincipalComponentAnalysisTransformer(IHost host, ModelLoadContext 
ctx) : base(host, ctx) { Host.AssertValue(ctx); @@ -287,28 +246,28 @@ private static IRowMapper Create(IHostEnvironment env, ModelLoadContext ctx, Sch => Create(env, ctx).MakeRowMapper(inputSchema); // Factory method for SignatureDataTransform. - private static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + private static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); - env.CheckValue(args, nameof(args)); + env.CheckValue(options, nameof(options)); env.CheckValue(input, nameof(input)); - env.CheckValue(args.Columns, nameof(args.Columns)); - var cols = args.Columns.Select(item => new ColumnInfo( + env.CheckValue(options.Columns, nameof(options.Columns)); + var cols = options.Columns.Select(item => new PrincipalComponentAnalysisEstimator.ColumnInfo( item.Name, item.Source, item.WeightColumn, - item.Rank ?? args.Rank, - item.Oversampling ?? args.Oversampling, - item.Center ?? args.Center, - item.Seed ?? args.Seed)).ToArray(); - return new PcaTransformer(env, input, cols).MakeDataTransform(input); + item.Rank ?? options.Rank, + item.Oversampling ?? options.Oversampling, + item.Center ?? options.Center, + item.Seed ?? options.Seed)).ToArray(); + return new PrincipalComponentAnalysisTransformer(env, input, cols).MakeDataTransform(input); } // Factory method for SignatureLoadModel. 
- private static PcaTransformer Create(IHostEnvironment env, ModelLoadContext ctx) + private static PrincipalComponentAnalysisTransformer Create(IHostEnvironment env, ModelLoadContext ctx) { Contracts.CheckValue(env, nameof(env)); - var host = env.Register(nameof(PcaTransformer)); + var host = env.Register(nameof(PrincipalComponentAnalysisTransformer)); host.CheckValue(ctx, nameof(ctx)); ctx.CheckAtModel(GetVersionInfo()); @@ -317,7 +276,7 @@ private static PcaTransformer Create(IHostEnvironment env, ModelLoadContext ctx) int cbFloat = ctx.Reader.ReadInt32(); env.CheckDecode(cbFloat == sizeof(float)); } - return new PcaTransformer(host, ctx); + return new PrincipalComponentAnalysisTransformer(host, ctx); } public override void Save(ModelSaveContext ctx) @@ -333,13 +292,13 @@ public override void Save(ModelSaveContext ctx) for (int i = 0; i < _transformInfos.Length; i++) _transformInfos[i].Save(ctx); } - private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(ColumnInfo[] columns) + private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(PrincipalComponentAnalysisEstimator.ColumnInfo[] columns) { Contracts.CheckValue(columns, nameof(columns)); return columns.Select(x => (x.Name, x.InputColumnName)).ToArray(); } - private void Train(ColumnInfo[] columns, TransformInfo[] transformInfos, IDataView trainingData) + private void Train(PrincipalComponentAnalysisEstimator.ColumnInfo[] columns, TransformInfo[] transformInfos, IDataView trainingData) { var y = new float[_numColumns][][]; var omega = new float[_numColumns][][]; @@ -579,10 +538,10 @@ public ColumnSchemaInfo((string outputColumnName, string inputColumnName) column } } - private readonly PcaTransformer _parent; + private readonly PrincipalComponentAnalysisTransformer _parent; private readonly int _numColumns; - public Mapper(PcaTransformer parent, Schema inputSchema) + public Mapper(PrincipalComponentAnalysisTransformer parent, Schema inputSchema) : 
base(parent.Host.Register(nameof(Mapper)), parent, inputSchema) { _parent = parent; @@ -645,10 +604,10 @@ private static void TransformFeatures(IExceptionContext ectx, in VBuffer Desc = Summary, UserName = UserName, ShortName = ShortName)] - internal static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Arguments input) + internal static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Options input) { var h = EntryPointUtils.CheckArgsAndCreateHost(env, "Pca", input); - var view = PcaTransformer.Create(h, input, input.Data); + var view = PrincipalComponentAnalysisTransformer.Create(h, input, input.Data); return new CommonOutputs.TransformOutput() { Model = new TransformModelImpl(h, view, input.Data), @@ -658,7 +617,7 @@ internal static CommonOutputs.TransformOutput Calculate(IHostEnvironment env, Ar } /// - public sealed class PrincipalComponentAnalysisEstimator : IEstimator + public sealed class PrincipalComponentAnalysisEstimator : IEstimator { [BestFriend] internal static class Defaults @@ -670,8 +629,73 @@ internal static class Defaults public const int Seed = 0; } + /// + /// Describes how the transformer handles one column pair. + /// + public sealed class ColumnInfo + { + /// + /// Name of the column resulting from the transformation of . + /// + public readonly string Name; + /// + /// Name of column to transform. + /// + public readonly string InputColumnName; + /// + /// The name of the weight column. + /// + public readonly string WeightColumn; + /// + /// The number of components in the PCA. + /// + public readonly int Rank; + /// + /// Oversampling parameter for randomized PCA training. + /// + public readonly int Oversampling; + /// + /// If enabled, data is centered to be zero mean. + /// + public readonly bool Center; + /// + /// The seed for random number generation. + /// + public readonly int? Seed; + + /// + /// Describes how the transformer handles one column pair. 
+ /// + /// Name of the column resulting from the transformation of . + /// Name of column to transform. + /// If set to , the value of the will be used as source. + /// The name of the weight column. + /// The number of components in the PCA. + /// Oversampling parameter for randomized PCA training. + /// If enabled, data is centered to be zero mean. + /// The random seed. If unspecified random state will be instead derived from the . + public ColumnInfo(string name, + string inputColumnName = null, + string weightColumn = Defaults.WeightColumn, + int rank = Defaults.Rank, + int overSampling = Defaults.Oversampling, + bool center = Defaults.Center, + int? seed = null) + { + Name = name; + InputColumnName = inputColumnName ?? name; + WeightColumn = weightColumn; + Rank = rank; + Oversampling = overSampling; + Center = center; + Seed = seed; + Contracts.CheckParam(Oversampling >= 0, nameof(Oversampling), "Oversampling must be non-negative."); + Contracts.CheckParam(Rank > 0, nameof(Rank), "Rank must be positive."); + } + } + private readonly IHost _host; - private readonly PcaTransformer.ColumnInfo[] _columns; + private readonly ColumnInfo[] _columns; /// /// The environment to use. @@ -683,28 +707,35 @@ internal static class Defaults /// Oversampling parameter for randomized PCA training. /// If enabled, data is centered to be zero mean. /// The seed for random number generation. - public PrincipalComponentAnalysisEstimator(IHostEnvironment env, + internal PrincipalComponentAnalysisEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, string weightColumn = Defaults.WeightColumn, int rank = Defaults.Rank, int overSampling = Defaults.Oversampling, bool center = Defaults.Center, int? seed = null) - : this(env, new PcaTransformer.ColumnInfo(outputColumnName, inputColumnName ?? outputColumnName, weightColumn, rank, overSampling, center, seed)) + : this(env, new ColumnInfo(outputColumnName, inputColumnName ?? 
outputColumnName, weightColumn, rank, overSampling, center, seed)) { } /// /// The environment to use. /// The dataset columns to use, and their specific settings. - public PrincipalComponentAnalysisEstimator(IHostEnvironment env, params PcaTransformer.ColumnInfo[] columns) + internal PrincipalComponentAnalysisEstimator(IHostEnvironment env, params ColumnInfo[] columns) { Contracts.CheckValue(env, nameof(env)); _host = env.Register(nameof(PrincipalComponentAnalysisEstimator)); _columns = columns; } - public PcaTransformer Fit(IDataView input) => new PcaTransformer(_host, input, _columns); + /// + /// Trains and returns a . + /// + public PrincipalComponentAnalysisTransformer Fit(IDataView input) => new PrincipalComponentAnalysisTransformer(_host, input, _columns); + /// + /// Returns the of the schema which will be produced by the transformer. + /// Used for schema propagation and verification in a pipeline. + /// public SchemaShape GetOutputSchema(SchemaShape inputSchema) { _host.CheckValue(inputSchema, nameof(inputSchema)); diff --git a/src/Microsoft.ML.StandardLearners/Standard/SdcaBinary.cs b/src/Microsoft.ML.StandardLearners/Standard/SdcaBinary.cs index 9237f79abe..97f8b4b6fc 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/SdcaBinary.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/SdcaBinary.cs @@ -107,7 +107,7 @@ private protected RoleMappedData PrepareDataFromTrainingExamples(IChannel ch, Ro idvToFeedTrain = idvToShuffle; else { - var shuffleArgs = new RowShufflingTransformer.Arguments + var shuffleArgs = new RowShufflingTransformer.Options { PoolOnly = false, ForceShuffle = ShuffleData diff --git a/src/Microsoft.ML.StandardLearners/Standard/StochasticTrainerBase.cs b/src/Microsoft.ML.StandardLearners/Standard/StochasticTrainerBase.cs index 550939892f..a849baa509 100644 --- a/src/Microsoft.ML.StandardLearners/Standard/StochasticTrainerBase.cs +++ b/src/Microsoft.ML.StandardLearners/Standard/StochasticTrainerBase.cs @@ -73,7 +73,7 @@ private 
protected RoleMappedData PrepareDataFromTrainingExamples(IChannel ch, Ro idvToFeedTrain = idvToShuffle; else { - var shuffleArgs = new RowShufflingTransformer.Arguments + var shuffleArgs = new RowShufflingTransformer.Options { PoolOnly = false, ForceShuffle = ShuffleData diff --git a/src/Microsoft.ML.StaticPipe/TransformsStatic.cs b/src/Microsoft.ML.StaticPipe/TransformsStatic.cs index 10bee69c74..b1c15d3633 100644 --- a/src/Microsoft.ML.StaticPipe/TransformsStatic.cs +++ b/src/Microsoft.ML.StaticPipe/TransformsStatic.cs @@ -1595,11 +1595,11 @@ private sealed class Reconciler : EstimatorReconciler public override IEstimator Reconcile(IHostEnvironment env, PipelineColumn[] toOutput, IReadOnlyDictionary inputNames, IReadOnlyDictionary outputNames, IReadOnlyCollection usedNames) { - var infos = new RandomFourierFeaturizingTransformer.ColumnInfo[toOutput.Length]; + var infos = new RandomFourierFeaturizingEstimator.ColumnInfo[toOutput.Length]; for (int i = 0; i < toOutput.Length; ++i) { var tcol = (IColInput)toOutput[i]; - infos[i] = new RandomFourierFeaturizingTransformer.ColumnInfo(outputNames[toOutput[i]], tcol.Config.NewDim, tcol.Config.UseSin, inputNames[tcol.Input], tcol.Config.Generator, tcol.Config.Seed); + infos[i] = new RandomFourierFeaturizingEstimator.ColumnInfo(outputNames[toOutput[i]], tcol.Config.NewDim, tcol.Config.UseSin, inputNames[tcol.Input], tcol.Config.Generator, tcol.Config.Seed); } return new RandomFourierFeaturizingEstimator(env, infos); } @@ -1640,11 +1640,11 @@ public OutPipelineColumn(Vector input, string weightColumn, int rank, private sealed class Reconciler : EstimatorReconciler { - private readonly PcaTransformer.ColumnInfo _colInfo; + private readonly PrincipalComponentAnalysisEstimator.ColumnInfo _colInfo; public Reconciler(string weightColumn, int rank, int overSampling, bool center, int? 
seed = null) { - _colInfo = new PcaTransformer.ColumnInfo( + _colInfo = new PrincipalComponentAnalysisEstimator.ColumnInfo( null, null, weightColumn, rank, overSampling, center, seed); } diff --git a/src/Microsoft.ML.Transforms/GcnTransform.cs b/src/Microsoft.ML.Transforms/GcnTransform.cs index 3ae4156ec2..f48a4329b7 100644 --- a/src/Microsoft.ML.Transforms/GcnTransform.cs +++ b/src/Microsoft.ML.Transforms/GcnTransform.cs @@ -17,13 +17,13 @@ using Microsoft.ML.Model; using Microsoft.ML.Transforms.Projections; -[assembly: LoadableClass(LpNormalizingTransformer.GcnSummary, typeof(IDataTransform), typeof(LpNormalizingTransformer), typeof(LpNormalizingTransformer.GcnArguments), typeof(SignatureDataTransform), +[assembly: LoadableClass(LpNormalizingTransformer.GcnSummary, typeof(IDataTransform), typeof(LpNormalizingTransformer), typeof(LpNormalizingTransformer.GcnOptions), typeof(SignatureDataTransform), LpNormalizingTransformer.UserNameGn, "GcnTransform", LpNormalizingTransformer.ShortNameGn)] [assembly: LoadableClass(LpNormalizingTransformer.GcnSummary, typeof(IDataTransform), typeof(LpNormalizingTransformer), null, typeof(SignatureLoadDataTransform), LpNormalizingTransformer.UserNameGn, LpNormalizingTransformer.LoaderSignature, LpNormalizingTransformer.LoaderSignatureOld)] -[assembly: LoadableClass(LpNormalizingTransformer.Summary, typeof(IDataTransform), typeof(LpNormalizingTransformer), typeof(LpNormalizingTransformer.Arguments), typeof(SignatureDataTransform), +[assembly: LoadableClass(LpNormalizingTransformer.Summary, typeof(IDataTransform), typeof(LpNormalizingTransformer), typeof(LpNormalizingTransformer.Options), typeof(SignatureDataTransform), LpNormalizingTransformer.UserNameLP, "LpNormNormalizer", LpNormalizingTransformer.ShortNameLP)] [assembly: LoadableClass(LpNormalizingTransformer.Summary, typeof(LpNormalizingTransformer), null, typeof(SignatureLoadModel), @@ -51,7 +51,7 @@ namespace Microsoft.ML.Transforms.Projections /// public sealed class 
LpNormalizingTransformer : OneToOneTransformerBase { - public sealed class Arguments : TransformInputBase + internal sealed class Options : TransformInputBase { [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)", Name = "Column", ShortName = "col", SortOrder = 1)] public Column[] Columns; @@ -63,7 +63,7 @@ public sealed class Arguments : TransformInputBase public bool SubMean = LpNormalizingEstimatorBase.Defaults.LpSubstractMean; } - public sealed class GcnArguments : TransformInputBase + internal sealed class GcnOptions : TransformInputBase { [Argument(ArgumentType.Multiple, HelpText = "New column definition(s) (optional form: name:src)", Name = "Column", ShortName = "col", SortOrder = 1)] public GcnColumn[] Columns; @@ -78,7 +78,7 @@ public sealed class GcnArguments : TransformInputBase public float Scale = LpNormalizingEstimatorBase.Defaults.Scale; } - public abstract class ColumnBase : OneToOneColumn + internal abstract class ColumnBase : OneToOneColumn { [Argument(ArgumentType.AtMostOnce, HelpText = "Subtract mean from each value before normalizing")] public bool? SubMean; @@ -96,7 +96,7 @@ private protected override bool TryUnparseCore(StringBuilder sb) } } - public sealed class Column : ColumnBase + internal sealed class Column : ColumnBase { [Argument(ArgumentType.AtMostOnce, HelpText = "The norm to use to normalize each sample", ShortName = "norm", SortOrder = 1)] public LpNormalizingEstimatorBase.NormalizerKind? NormKind; @@ -120,7 +120,7 @@ internal bool TryUnparse(StringBuilder sb) } } - public sealed class GcnColumn : ColumnBase + internal sealed class GcnColumn : ColumnBase { [Argument(ArgumentType.AtMostOnce, HelpText = "Normalize by standard deviation rather than L2 norm")] public bool? UseStdDev; @@ -147,49 +147,7 @@ internal bool TryUnparse(StringBuilder sb) } } - /// - /// Describes how the transformer handles one Gcn column pair. 
- /// - public sealed class GcnColumnInfo : ColumnInfoBase - { - /// - /// Describes how the transformer handles one Gcn column pair. - /// - /// Name of the column resulting from the transformation of . - /// Name of column to transform. If set to , the value of the will be used as source. - /// Subtract mean from each value before normalizing. - /// Normalize by standard deviation rather than L2 norm. - /// Scale features by this value. - public GcnColumnInfo(string name, string inputColumnName = null, - bool substractMean = LpNormalizingEstimatorBase.Defaults.GcnSubstractMean, - bool useStdDev = LpNormalizingEstimatorBase.Defaults.UseStdDev, - float scale = LpNormalizingEstimatorBase.Defaults.Scale) - : base(name, inputColumnName, substractMean, useStdDev ? LpNormalizingEstimatorBase.NormalizerKind.StdDev : LpNormalizingEstimatorBase.NormalizerKind.L2Norm, scale) - { - } - } - - /// - /// Describes how the transformer handles one LpNorm column pair. - /// - public sealed class LpNormColumnInfo : ColumnInfoBase - { - /// - /// Describes how the transformer handles one LpNorm column pair. - /// - /// Name of the column resulting from the transformation of . - /// Name of column to transform. If set to , the value of the will be used as source. - /// Subtract mean from each value before normalizing. - /// The norm to use to normalize each sample. - public LpNormColumnInfo(string name, string inputColumnName = null, - bool substractMean = LpNormalizingEstimatorBase.Defaults.LpSubstractMean, - LpNormalizingEstimatorBase.NormalizerKind normalizerKind = LpNormalizingEstimatorBase.Defaults.NormKind) - : base(name, inputColumnName ?? 
name, substractMean, normalizerKind, 1) - { - } - } - - private sealed class ColumnInfoLoaded : ColumnInfoBase + private sealed class ColumnInfoLoaded : LpNormalizingEstimatorBase.ColumnInfoBase { internal ColumnInfoLoaded(ModelLoadContext ctx, string name, string inputColumnName, bool normKindSerialized) : base(ctx, name, inputColumnName, normKindSerialized) @@ -198,69 +156,6 @@ internal ColumnInfoLoaded(ModelLoadContext ctx, string name, string inputColumnN } } - /// - /// Describes base class for one column pair. - /// - public abstract class ColumnInfoBase - { - public readonly string Name; - public readonly string InputColumnName; - public readonly bool SubtractMean; - public readonly LpNormalizingEstimatorBase.NormalizerKind NormKind; - public readonly float Scale; - - internal ColumnInfoBase(string name, string inputColumnName, bool substractMean, LpNormalizingEstimatorBase.NormalizerKind normalizerKind, float scale) - { - Contracts.CheckNonWhiteSpace(name, nameof(name)); - Contracts.CheckNonWhiteSpace(inputColumnName, nameof(inputColumnName)); - Name = name; - InputColumnName = inputColumnName; - SubtractMean = substractMean; - Contracts.CheckUserArg(0 < scale && scale < float.PositiveInfinity, nameof(scale), "scale must be a positive finite value"); - Scale = scale; - NormKind = normalizerKind; - } - - internal ColumnInfoBase(ModelLoadContext ctx, string name, string inputColumnName, bool normKindSerialized) - { - Contracts.AssertValue(ctx); - Contracts.CheckNonWhiteSpace(inputColumnName, nameof(inputColumnName)); - Contracts.CheckNonWhiteSpace(name, nameof(name)); - Name = name; - InputColumnName = inputColumnName; - - // *** Binary format *** - // byte: SubtractMean - // byte: NormKind - // Float: Scale - SubtractMean = ctx.Reader.ReadBoolByte(); - byte normKindVal = ctx.Reader.ReadByte(); - Contracts.CheckDecode(Enum.IsDefined(typeof(LpNormalizingEstimatorBase.NormalizerKind), normKindVal)); - NormKind = 
(LpNormalizingEstimatorBase.NormalizerKind)normKindVal; - // Note: In early versions, a bool option (useStd) to whether to normalize by StdDev rather than - // L2 norm was used. normKind was added in version=verVectorNormalizerSupported. - // normKind was defined in a way such that the serialized boolean (0: use StdDev, 1: use L2) is - // still valid. - Contracts.CheckDecode(normKindSerialized || - (NormKind == LpNormalizingEstimatorBase.NormalizerKind.L2Norm || NormKind == LpNormalizingEstimatorBase.NormalizerKind.StdDev)); - Scale = ctx.Reader.ReadFloat(); - Contracts.CheckDecode(0 < Scale && Scale < float.PositiveInfinity); - } - - internal void Save(ModelSaveContext ctx) - { - Contracts.AssertValue(ctx); - // *** Binary format *** - // byte: SubtractMean - // byte: NormKind - // Float: Scale - ctx.Writer.WriteBoolByte(SubtractMean); - ctx.Writer.Write((byte)NormKind); - Contracts.Assert(0 < Scale && Scale < float.PositiveInfinity); - ctx.Writer.Write(Scale); - } - } - internal const string GcnSummary = "Performs a global contrast normalization on input values: Y = (s * X - M) / D, where s is a scale, M is mean and D is " + "either L2 norm or standard deviation."; internal const string UserNameGn = "Global Contrast Normalization Transform"; @@ -296,10 +191,13 @@ private static VersionInfo GetVersionInfo() // REVIEW: should this be an argument instead? private const float MinScale = 1e-8f; - public IReadOnlyCollection Columns => _columns.AsReadOnly(); - private readonly ColumnInfoBase[] _columns; + /// + /// The objects describing how the transformation is applied on the input data. 
+ /// + public IReadOnlyCollection Columns => _columns.AsReadOnly(); + private readonly LpNormalizingEstimatorBase.ColumnInfoBase[] _columns; - private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(ColumnInfoBase[] columns) + private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(LpNormalizingEstimatorBase.ColumnInfoBase[] columns) { Contracts.CheckValue(columns, nameof(columns)); return columns.Select(x => (x.Name, x.InputColumnName)).ToArray(); @@ -314,58 +212,58 @@ protected override void CheckInputColumn(Schema inputSchema, int col, int srcCol /// /// Create a that takes multiple pairs of columns. /// - public LpNormalizingTransformer(IHostEnvironment env, params ColumnInfoBase[] columns) : + internal LpNormalizingTransformer(IHostEnvironment env, params LpNormalizingEstimatorBase.ColumnInfoBase[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(LpNormalizingTransformer)), GetColumnPairs(columns)) { _columns = columns.ToArray(); } // Factory method for SignatureDataTransform for GcnArguments class. - internal static IDataTransform Create(IHostEnvironment env, GcnArguments args, IDataView input) + internal static IDataTransform Create(IHostEnvironment env, GcnOptions options, IDataView input) { Contracts.CheckValue(env, nameof(env)); - env.CheckValue(args, nameof(args)); + env.CheckValue(options, nameof(options)); env.CheckValue(input, nameof(input)); - env.CheckValue(args.Columns, nameof(args.Columns)); - var cols = new GcnColumnInfo[args.Columns.Length]; + env.CheckValue(options.Columns, nameof(options.Columns)); + var cols = new GlobalContrastNormalizingEstimator.GcnColumnInfo[options.Columns.Length]; using (var ch = env.Start("ValidateArgs")) { for (int i = 0; i < cols.Length; i++) { - var item = args.Columns[i]; - cols[i] = new GcnColumnInfo( + var item = options.Columns[i]; + cols[i] = new GlobalContrastNormalizingEstimator.GcnColumnInfo( item.Name, item.Source ?? 
item.Name, - item.SubMean ?? args.SubMean, - item.UseStdDev ?? args.UseStdDev, - item.Scale ?? args.Scale); + item.SubMean ?? options.SubMean, + item.UseStdDev ?? options.UseStdDev, + item.Scale ?? options.Scale); } - if (!args.SubMean && args.UseStdDev) + if (!options.SubMean && options.UseStdDev) ch.Warning("subMean parameter is false while useStd is true. It is advisable to set subMean to true in case useStd is set to true."); } return new LpNormalizingTransformer(env, cols).MakeDataTransform(input); } // Factory method for SignatureDataTransform for Arguments class. - internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); - env.CheckValue(args, nameof(args)); + env.CheckValue(options, nameof(options)); env.CheckValue(input, nameof(input)); - env.CheckValue(args.Columns, nameof(args.Columns)); - var cols = new LpNormColumnInfo[args.Columns.Length]; + env.CheckValue(options.Columns, nameof(options.Columns)); + var cols = new LpNormalizingEstimator.LpNormColumnInfo[options.Columns.Length]; using (var ch = env.Start("ValidateArgs")) { for (int i = 0; i < cols.Length; i++) { - var item = args.Columns[i]; - cols[i] = new LpNormColumnInfo( + var item = options.Columns[i]; + cols[i] = new LpNormalizingEstimator.LpNormColumnInfo( item.Name, item.Source ?? item.Name, - item.SubMean ?? args.SubMean, - item.NormKind ?? args.NormKind); + item.SubMean ?? options.SubMean, + item.NormKind ?? 
options.NormKind); } } return new LpNormalizingTransformer(env, cols).MakeDataTransform(input); @@ -717,7 +615,7 @@ internal static class LpNormalization Desc = LpNormalizingTransformer.Summary, UserName = LpNormalizingTransformer.UserNameLP, ShortName = LpNormalizingTransformer.ShortNameLP)] - public static CommonOutputs.TransformOutput Normalize(IHostEnvironment env, LpNormalizingTransformer.Arguments input) + public static CommonOutputs.TransformOutput Normalize(IHostEnvironment env, LpNormalizingTransformer.Options input) { var h = EntryPointUtils.CheckArgsAndCreateHost(env, "LpNormalize", input); var xf = LpNormalizingTransformer.Create(h, input, input.Data); @@ -732,7 +630,7 @@ public static CommonOutputs.TransformOutput Normalize(IHostEnvironment env, LpNo Desc = LpNormalizingTransformer.GcnSummary, UserName = LpNormalizingTransformer.UserNameGn, ShortName = LpNormalizingTransformer.ShortNameGn)] - public static CommonOutputs.TransformOutput GcNormalize(IHostEnvironment env, LpNormalizingTransformer.GcnArguments input) + public static CommonOutputs.TransformOutput GcNormalize(IHostEnvironment env, LpNormalizingTransformer.GcnOptions input) { var h = EntryPointUtils.CheckArgsAndCreateHost(env, "GcNormalize", input); var xf = LpNormalizingTransformer.Create(h, input, input.Data); @@ -760,6 +658,84 @@ public enum NormalizerKind : byte LInf = 3 } + /// + /// Describes base class for one column pair. + /// + public abstract class ColumnInfoBase + { + /// + /// Name of the column resulting from the transformation of . + /// + public readonly string Name; + /// + /// Name of column to transform. + /// + public readonly string InputColumnName; + /// + /// Subtract mean from each value before normalizing. + /// + public readonly bool SubtractMean; + /// + /// The norm to use to normalize each sample. + /// + public readonly NormalizerKind NormKind; + /// + /// Scale features by this value. 
+ /// + public readonly float Scale; + + internal ColumnInfoBase(string name, string inputColumnName, bool substractMean, NormalizerKind normalizerKind, float scale) + { + Contracts.CheckNonWhiteSpace(name, nameof(name)); + Contracts.CheckNonWhiteSpace(inputColumnName, nameof(inputColumnName)); + Name = name; + InputColumnName = inputColumnName; + SubtractMean = substractMean; + Contracts.CheckUserArg(0 < scale && scale < float.PositiveInfinity, nameof(scale), "scale must be a positive finite value"); + Scale = scale; + NormKind = normalizerKind; + } + + internal ColumnInfoBase(ModelLoadContext ctx, string name, string inputColumnName, bool normKindSerialized) + { + Contracts.AssertValue(ctx); + Contracts.CheckNonWhiteSpace(inputColumnName, nameof(inputColumnName)); + Contracts.CheckNonWhiteSpace(name, nameof(name)); + Name = name; + InputColumnName = inputColumnName; + + // *** Binary format *** + // byte: SubtractMean + // byte: NormKind + // Float: Scale + SubtractMean = ctx.Reader.ReadBoolByte(); + byte normKindVal = ctx.Reader.ReadByte(); + Contracts.CheckDecode(Enum.IsDefined(typeof(NormalizerKind), normKindVal)); + NormKind = (NormalizerKind)normKindVal; + // Note: In early versions, a bool option (useStd) to whether to normalize by StdDev rather than + // L2 norm was used. normKind was added in version=verVectorNormalizerSupported. + // normKind was defined in a way such that the serialized boolean (0: use StdDev, 1: use L2) is + // still valid. 
+ Contracts.CheckDecode(normKindSerialized || + (NormKind == NormalizerKind.L2Norm || NormKind == NormalizerKind.StdDev)); + Scale = ctx.Reader.ReadFloat(); + Contracts.CheckDecode(0 < Scale && Scale < float.PositiveInfinity); + } + + internal void Save(ModelSaveContext ctx) + { + Contracts.AssertValue(ctx); + // *** Binary format *** + // byte: SubtractMean + // byte: NormKind + // Float: Scale + ctx.Writer.WriteBoolByte(SubtractMean); + ctx.Writer.Write((byte)NormKind); + Contracts.Assert(0 < Scale && Scale < float.PositiveInfinity); + ctx.Writer.Write(Scale); + } + } + [BestFriend] internal static class Defaults { @@ -773,10 +749,9 @@ internal static class Defaults /// /// Create a that takes multiple pairs of columns. /// - public LpNormalizingEstimatorBase(IHostEnvironment env, params LpNormalizingTransformer.ColumnInfoBase[] columns) + internal LpNormalizingEstimatorBase(IHostEnvironment env, params ColumnInfoBase[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(LpNormalizingEstimator)), new LpNormalizingTransformer(env, columns)) { - } internal static bool IsColumnTypeValid(ColumnType type) @@ -795,6 +770,10 @@ internal static bool IsSchemaColumnValid(SchemaShape.Column col) internal const string ExpectedColumnType = "Expected float or float vector of known size"; + /// + /// Returns the of the schema which will be produced by the transformer. + /// Used for schema propagation and verification in a pipeline. + /// public override SchemaShape GetOutputSchema(SchemaShape inputSchema) { Host.CheckValue(inputSchema, nameof(inputSchema)); @@ -816,17 +795,36 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema) } /// - /// Lp Normalizing estimator allow you take columns and normalize them individually by rescaling them to unit norm. + /// Lp Normalizing estimator takes columns and normalizes them individually by rescaling them to unit norm. 
/// public sealed class LpNormalizingEstimator : LpNormalizingEstimatorBase { + /// + /// Describes how the transformer handles one column pair. + /// + public sealed class LpNormColumnInfo : ColumnInfoBase + { + /// + /// Describes how the transformer handles one column pair. + /// + /// Name of the column resulting from the transformation of . + /// Name of column to transform. If set to , the value of the will be used as source. + /// Subtract mean from each value before normalizing. + /// The norm to use to normalize each sample. + public LpNormColumnInfo(string name, string inputColumnName = null, + bool substractMean = Defaults.LpSubstractMean, + NormalizerKind normalizerKind = Defaults.NormKind) + : base(name, inputColumnName ?? name, substractMean, normalizerKind, 1) + { + } + } /// /// The environment. /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. /// Type of norm to use to normalize each sample. /// Subtract mean from each value before normalizing. - public LpNormalizingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, + internal LpNormalizingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, NormalizerKind normKind = Defaults.NormKind, bool substractMean = Defaults.LpSubstractMean) : this(env, new[] { (outputColumnName, inputColumnName ?? outputColumnName) }, normKind, substractMean) { @@ -837,26 +835,48 @@ public LpNormalizingEstimator(IHostEnvironment env, string outputColumnName, str /// Pairs of columns to run the normalization on. /// Type of norm to use to normalize each sample. /// Subtract mean from each value before normalizing. 
- public LpNormalizingEstimator(IHostEnvironment env, (string outputColumnName, string inputColumnName)[] columns, + internal LpNormalizingEstimator(IHostEnvironment env, (string outputColumnName, string inputColumnName)[] columns, NormalizerKind normKind = Defaults.NormKind, bool substractMean = Defaults.LpSubstractMean) - : this(env, columns.Select(x => new LpNormalizingTransformer.LpNormColumnInfo(x.outputColumnName, x.inputColumnName, substractMean, normKind)).ToArray()) + : this(env, columns.Select(x => new LpNormColumnInfo(x.outputColumnName, x.inputColumnName, substractMean, normKind)).ToArray()) { } /// /// Create a that takes multiple pairs of columns. /// - public LpNormalizingEstimator(IHostEnvironment env, params LpNormalizingTransformer.LpNormColumnInfo[] columns) + internal LpNormalizingEstimator(IHostEnvironment env, params LpNormColumnInfo[] columns) : base(env, columns) { } } /// - /// Global contrast normalizing estimator allow you take columns and performs global constrast normalization on them. + /// Global contrast normalizing estimator takes columns and performs global constrast normalization. /// public sealed class GlobalContrastNormalizingEstimator : LpNormalizingEstimatorBase { + /// + /// Describes how the transformer handles one Gcn column pair. + /// + public sealed class GcnColumnInfo : ColumnInfoBase + { + /// + /// Describes how the transformer handles one Gcn column pair. + /// + /// Name of the column resulting from the transformation of . + /// Name of column to transform. If set to , the value of the will be used as source. + /// Subtract mean from each value before normalizing. + /// Normalize by standard deviation rather than L2 norm. + /// Scale features by this value. + public GcnColumnInfo(string name, string inputColumnName = null, + bool substractMean = Defaults.GcnSubstractMean, + bool useStdDev = Defaults.UseStdDev, + float scale = Defaults.Scale) + : base(name, inputColumnName, substractMean, useStdDev ? 
NormalizerKind.StdDev : NormalizerKind.L2Norm, scale) + { + } + } + /// /// The environment. /// Name of the column resulting from the transformation of . @@ -864,7 +884,7 @@ public sealed class GlobalContrastNormalizingEstimator : LpNormalizingEstimatorB /// Subtract mean from each value before normalizing. /// Normalize by standard deviation rather than L2 norm. /// Scale features by this value. - public GlobalContrastNormalizingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, + internal GlobalContrastNormalizingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, bool substractMean = Defaults.GcnSubstractMean, bool useStdDev = Defaults.UseStdDev, float scale = Defaults.Scale) : this(env, new[] { (outputColumnName, inputColumnName ?? outputColumnName) }, substractMean, useStdDev, scale) { @@ -876,16 +896,16 @@ public GlobalContrastNormalizingEstimator(IHostEnvironment env, string outputCol /// Subtract mean from each value before normalizing. /// Normalize by standard deviation rather than L2 norm. /// Scale features by this value. - public GlobalContrastNormalizingEstimator(IHostEnvironment env, (string outputColumnName, string inputColumnName)[] columns, + internal GlobalContrastNormalizingEstimator(IHostEnvironment env, (string outputColumnName, string inputColumnName)[] columns, bool substractMean = Defaults.GcnSubstractMean, bool useStdDev = Defaults.UseStdDev, float scale = Defaults.Scale) - : this(env, columns.Select(x => new LpNormalizingTransformer.GcnColumnInfo(x.outputColumnName, x.inputColumnName, substractMean, useStdDev, scale)).ToArray()) + : this(env, columns.Select(x => new GcnColumnInfo(x.outputColumnName, x.inputColumnName, substractMean, useStdDev, scale)).ToArray()) { } /// /// Create a that takes multiple pairs of columns. 
/// - public GlobalContrastNormalizingEstimator(IHostEnvironment env, params LpNormalizingTransformer.GcnColumnInfo[] columns) : + internal GlobalContrastNormalizingEstimator(IHostEnvironment env, params GcnColumnInfo[] columns) : base(env, columns) { } diff --git a/src/Microsoft.ML.Transforms/LearnerFeatureSelection.cs b/src/Microsoft.ML.Transforms/LearnerFeatureSelection.cs index 405bd1155d..3c66d9d06d 100644 --- a/src/Microsoft.ML.Transforms/LearnerFeatureSelection.cs +++ b/src/Microsoft.ML.Transforms/LearnerFeatureSelection.cs @@ -13,7 +13,7 @@ using Microsoft.ML.Transforms; using Microsoft.ML.Transforms.FeatureSelection; -[assembly: LoadableClass(LearnerFeatureSelectionTransform.Summary, typeof(IDataTransform), typeof(LearnerFeatureSelectionTransform), typeof(LearnerFeatureSelectionTransform.Arguments), typeof(SignatureDataTransform), +[assembly: LoadableClass(LearnerFeatureSelectionTransform.Summary, typeof(IDataTransform), typeof(LearnerFeatureSelectionTransform), typeof(LearnerFeatureSelectionTransform.Options), typeof(SignatureDataTransform), "Learner Feature Selection Transform", "LearnerFeatureSelectionTransform", "LearnerFeatureSelection")] namespace Microsoft.ML.Transforms @@ -28,7 +28,7 @@ internal static class LearnerFeatureSelectionTransform internal const string Summary = "Selects the slots for which the absolute value of the corresponding weight in a linear learner is greater than a threshold."; #pragma warning disable CS0649 // The fields will still be set via the reflection driven mechanisms. - public sealed class Arguments + public sealed class Options { [Argument(ArgumentType.LastOccurenceWins, HelpText = "If the corresponding absolute value of the weight for a slot is greater than this threshold, the slot is preserved", ShortName = "ft", SortOrder = 2)] public Single? 
Threshold; @@ -85,21 +85,21 @@ internal void Check(IExceptionContext ectx) internal static string RegistrationName = "LearnerFeatureSelectionTransform"; // Factory method for SignatureDataTransform. - private static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + private static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); var host = env.Register(RegistrationName); - host.CheckValue(args, nameof(args)); + host.CheckValue(options, nameof(options)); host.CheckValue(input, nameof(input)); - args.Check(host); + options.Check(host); var scores = default(VBuffer); - TrainCore(host, input, args, ref scores); + TrainCore(host, input, options, ref scores); using (var ch = host.Start("Dropping Slots")) { int selectedCount; - var column = CreateDropSlotsColumn(args, in scores, out selectedCount); + var column = CreateDropSlotsColumn(options, in scores, out selectedCount); if (column == null) { @@ -107,39 +107,39 @@ private static IDataTransform Create(IHostEnvironment env, Arguments args, IData return NopTransform.CreateIfNeeded(host, input); } - ch.Info(MessageSensitivity.Schema, "Selected {0} slots out of {1} in column '{2}'", selectedCount, scores.Length, args.FeatureColumn); + ch.Info(MessageSensitivity.Schema, "Selected {0} slots out of {1} in column '{2}'", selectedCount, scores.Length, options.FeatureColumn); return new SlotsDroppingTransformer(host, column).Transform(input) as IDataTransform; } } - private static SlotsDroppingTransformer.ColumnInfo CreateDropSlotsColumn(Arguments args, in VBuffer scores, out int selectedCount) + private static SlotsDroppingTransformer.ColumnInfo CreateDropSlotsColumn(Options options, in VBuffer scores, out int selectedCount) { // Not checking the scores.Length, because: // 1. If it's the same as the features column length, we should be constructing the right DropSlots arguments. // 2. 
If it's less, we assume that the rest of the scores are zero and we drop the slots. // 3. If it's greater, the drop slots ignores the ranges that are outside the valid range of indices for the column. - Contracts.Assert(args.Threshold.HasValue != args.NumSlotsToKeep.HasValue); + Contracts.Assert(options.Threshold.HasValue != options.NumSlotsToKeep.HasValue); var col = new SlotsDroppingTransformer.Column(); - col.Source = args.FeatureColumn; + col.Source = options.FeatureColumn; selectedCount = 0; var scoresValues = scores.GetValues(); // Degenerate case, dropping all slots. if (scoresValues.Length == 0) - return new SlotsDroppingTransformer.ColumnInfo(args.FeatureColumn); + return new SlotsDroppingTransformer.ColumnInfo(options.FeatureColumn); int tiedScoresToKeep; float threshold; - if (args.Threshold.HasValue) + if (options.Threshold.HasValue) { - threshold = args.Threshold.Value; + threshold = options.Threshold.Value; tiedScoresToKeep = threshold > 0 ? int.MaxValue : 0; } else { - Contracts.Assert(args.NumSlotsToKeep.HasValue); - threshold = ComputeThreshold(scoresValues, args.NumSlotsToKeep.Value, out tiedScoresToKeep); + Contracts.Assert(options.NumSlotsToKeep.HasValue); + threshold = ComputeThreshold(scoresValues, options.NumSlotsToKeep.Value, out tiedScoresToKeep); } var slots = new List<(int min, int? 
max)>(); @@ -224,7 +224,7 @@ private static SlotsDroppingTransformer.ColumnInfo CreateDropSlotsColumn(Argumen } if (slots.Count > 0) - return new SlotsDroppingTransformer.ColumnInfo(args.FeatureColumn, slots: slots.ToArray()); + return new SlotsDroppingTransformer.ColumnInfo(options.FeatureColumn, slots: slots.ToArray()); return null; } @@ -264,36 +264,36 @@ private static float ComputeThreshold(ReadOnlySpan scores, int topk, out return threshold; } - private static void TrainCore(IHost host, IDataView input, Arguments args, ref VBuffer scores) + private static void TrainCore(IHost host, IDataView input, Options options, ref VBuffer scores) { Contracts.AssertValue(host); - host.AssertValue(args); + host.AssertValue(options); host.AssertValue(input); - host.Assert(args.Threshold.HasValue != args.NumSlotsToKeep.HasValue); + host.Assert(options.Threshold.HasValue != options.NumSlotsToKeep.HasValue); using (var ch = host.Start("Train")) { ch.Trace("Constructing trainer"); - ITrainer trainer = args.Filter.CreateComponent(host); + ITrainer trainer = options.Filter.CreateComponent(host); IDataView view = input; var schema = view.Schema; - var label = TrainUtils.MatchNameOrDefaultOrNull(ch, schema, nameof(args.LabelColumn), args.LabelColumn, DefaultColumnNames.Label); - var feature = TrainUtils.MatchNameOrDefaultOrNull(ch, schema, nameof(args.FeatureColumn), args.FeatureColumn, DefaultColumnNames.Features); - var group = TrainUtils.MatchNameOrDefaultOrNull(ch, schema, nameof(args.GroupColumn), args.GroupColumn, DefaultColumnNames.GroupId); - var weight = TrainUtils.MatchNameOrDefaultOrNull(ch, schema, nameof(args.WeightColumn), args.WeightColumn, DefaultColumnNames.Weight); - var name = TrainUtils.MatchNameOrDefaultOrNull(ch, schema, nameof(args.NameColumn), args.NameColumn, DefaultColumnNames.Name); + var label = TrainUtils.MatchNameOrDefaultOrNull(ch, schema, nameof(options.LabelColumn), options.LabelColumn, DefaultColumnNames.Label); + var feature = 
TrainUtils.MatchNameOrDefaultOrNull(ch, schema, nameof(options.FeatureColumn), options.FeatureColumn, DefaultColumnNames.Features); + var group = TrainUtils.MatchNameOrDefaultOrNull(ch, schema, nameof(options.GroupColumn), options.GroupColumn, DefaultColumnNames.GroupId); + var weight = TrainUtils.MatchNameOrDefaultOrNull(ch, schema, nameof(options.WeightColumn), options.WeightColumn, DefaultColumnNames.Weight); + var name = TrainUtils.MatchNameOrDefaultOrNull(ch, schema, nameof(options.NameColumn), options.NameColumn, DefaultColumnNames.Name); - TrainUtils.AddNormalizerIfNeeded(host, ch, trainer, ref view, feature, args.NormalizeFeatures); + TrainUtils.AddNormalizerIfNeeded(host, ch, trainer, ref view, feature, options.NormalizeFeatures); ch.Trace("Binding columns"); - var customCols = TrainUtils.CheckAndGenerateCustomColumns(ch, args.CustomColumns); + var customCols = TrainUtils.CheckAndGenerateCustomColumns(ch, options.CustomColumns); var data = new RoleMappedData(view, label, feature, group, weight, name, customCols); var predictor = TrainUtils.Train(host, ch, data, trainer, null, - null, 0, args.CacheData); + null, 0, options.CacheData); var rfs = predictor as IPredictorWithFeatureWeights; Contracts.AssertValue(rfs); @@ -304,15 +304,15 @@ private static void TrainCore(IHost host, IDataView input, Arguments args, ref V /// /// Returns a score for each slot of the features column. 
/// - public static void Train(IHostEnvironment env, IDataView input, Arguments args, ref VBuffer scores) + public static void Train(IHostEnvironment env, IDataView input, Options options, ref VBuffer scores) { Contracts.CheckValue(env, nameof(env)); var host = env.Register(RegistrationName); - host.CheckValue(args, nameof(args)); + host.CheckValue(options, nameof(options)); host.CheckValue(input, nameof(input)); - args.Check(host); + options.Check(host); - TrainCore(host, input, args, ref scores); + TrainCore(host, input, options, ref scores); } } } diff --git a/src/Microsoft.ML.Transforms/ProjectionCatalog.cs b/src/Microsoft.ML.Transforms/ProjectionCatalog.cs index 5ef9670f25..1da466ec24 100644 --- a/src/Microsoft.ML.Transforms/ProjectionCatalog.cs +++ b/src/Microsoft.ML.Transforms/ProjectionCatalog.cs @@ -36,7 +36,7 @@ public static RandomFourierFeaturizingEstimator CreateRandomFourierFeatures(this /// /// The transform's catalog. /// The input columns to use for the transformation. - public static RandomFourierFeaturizingEstimator CreateRandomFourierFeatures(this TransformsCatalog.ProjectionTransforms catalog, params RandomFourierFeaturizingTransformer.ColumnInfo[] columns) + public static RandomFourierFeaturizingEstimator CreateRandomFourierFeatures(this TransformsCatalog.ProjectionTransforms catalog, params RandomFourierFeaturizingEstimator.ColumnInfo[] columns) => new RandomFourierFeaturizingEstimator(CatalogUtils.GetEnvironment(catalog), columns); /// @@ -63,7 +63,7 @@ public static LpNormalizingEstimator LpNormalize(this TransformsCatalog.Projecti /// /// The transform's catalog. /// Describes the parameters of the lp-normalization process for each column pair. 
- public static LpNormalizingEstimator LpNormalize(this TransformsCatalog.ProjectionTransforms catalog, params LpNormalizingTransformer.LpNormColumnInfo[] columns) + public static LpNormalizingEstimator LpNormalize(this TransformsCatalog.ProjectionTransforms catalog, params LpNormalizingEstimator.LpNormColumnInfo[] columns) => new LpNormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columns); /// @@ -93,7 +93,7 @@ public static GlobalContrastNormalizingEstimator GlobalContrastNormalize(this Tr /// /// The transform's catalog. /// Describes the parameters of the gcn-normaliztion process for each column pair. - public static GlobalContrastNormalizingEstimator GlobalContrastNormalize(this TransformsCatalog.ProjectionTransforms catalog, params LpNormalizingTransformer.GcnColumnInfo[] columns) + public static GlobalContrastNormalizingEstimator GlobalContrastNormalize(this TransformsCatalog.ProjectionTransforms catalog, params GlobalContrastNormalizingEstimator.GcnColumnInfo[] columns) => new GlobalContrastNormalizingEstimator(CatalogUtils.GetEnvironment(catalog), columns); } } diff --git a/src/Microsoft.ML.Transforms/RandomFourierFeaturizing.cs b/src/Microsoft.ML.Transforms/RandomFourierFeaturizing.cs index 6ba025abb0..754ac9b573 100644 --- a/src/Microsoft.ML.Transforms/RandomFourierFeaturizing.cs +++ b/src/Microsoft.ML.Transforms/RandomFourierFeaturizing.cs @@ -17,7 +17,7 @@ using Microsoft.ML.Numeric; using Microsoft.ML.Transforms.Projections; -[assembly: LoadableClass(RandomFourierFeaturizingTransformer.Summary, typeof(IDataTransform), typeof(RandomFourierFeaturizingTransformer), typeof(RandomFourierFeaturizingTransformer.Arguments), typeof(SignatureDataTransform), +[assembly: LoadableClass(RandomFourierFeaturizingTransformer.Summary, typeof(IDataTransform), typeof(RandomFourierFeaturizingTransformer), typeof(RandomFourierFeaturizingTransformer.Options), typeof(SignatureDataTransform), "Random Fourier Features Transform", "RffTransform", "Rff")] [assembly: 
LoadableClass(RandomFourierFeaturizingTransformer.Summary, typeof(IDataTransform), typeof(RandomFourierFeaturizingTransformer), null, typeof(SignatureLoadDataTransform), @@ -31,9 +31,12 @@ namespace Microsoft.ML.Transforms.Projections { + /// + /// Maps vector columns to a low -dimensional feature space. + /// public sealed class RandomFourierFeaturizingTransformer : OneToOneTransformerBase { - public sealed class Arguments + internal sealed class Options { [Argument(ArgumentType.Multiple | ArgumentType.Required, HelpText = "New column definition(s) (optional form: name:src)", Name = "Column", ShortName = "col", SortOrder = 1)] public Column[] Columns; @@ -52,7 +55,7 @@ public sealed class Arguments public int? Seed; } - public sealed class Column : OneToOneColumn + internal sealed class Column : OneToOneColumn { [Argument(ArgumentType.AtMostOnce, HelpText = "The number of random Fourier features to create", ShortName = "dim")] public int? NewDim; @@ -103,7 +106,7 @@ private sealed class TransformInfo private readonly TauswortheHybrid _rand; private readonly TauswortheHybrid.State _state; - public TransformInfo(IHost host, ColumnInfo column, int d, float avgDist) + public TransformInfo(IHost host, RandomFourierFeaturizingEstimator.ColumnInfo column, int d, float avgDist) { Contracts.AssertValue(host); @@ -207,7 +210,7 @@ private void InitializeFourierCoefficients(int rowSize, int colSize) + "since the transform is designed so that the inner products of the transformed data are approximately equal to those in the feature space of a user specified " + "shift-invariant kernel."; - public const string LoaderSignature = "RffTransform"; + internal const string LoaderSignature = "RffTransform"; private static VersionInfo GetVersionInfo() { @@ -232,37 +235,7 @@ private static string TestColumnType(ColumnType type) return "Expected vector of floats with known size"; } - public sealed class ColumnInfo - { - public readonly string Name; - public readonly string 
InputColumnName; - public readonly IComponentFactory Generator; - public readonly int NewDim; - public readonly bool UseSin; - public readonly int? Seed; - - /// - /// Describes how the transformer handles one column pair. - /// - /// Name of the column resulting from the transformation of . - /// The number of random Fourier features to create. - /// Create two features for every random Fourier frequency? (one for cos and one for sin). - /// Name of column to transform. - /// Which fourier generator to use. - /// The seed of the random number generator for generating the new features (if unspecified, the global random is used. - public ColumnInfo(string name, int newDim, bool useSin, string inputColumnName = null, IComponentFactory generator = null, int? seed = null) - { - Contracts.CheckUserArg(newDim > 0, nameof(newDim), "must be positive."); - InputColumnName = inputColumnName ?? name; - Name = name; - Generator = generator ?? new GaussianFourierSampler.Arguments(); - NewDim = newDim; - UseSin = useSin; - Seed = seed; - } - } - - private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(ColumnInfo[] columns) + private static (string outputColumnName, string inputColumnName)[] GetColumnPairs(RandomFourierFeaturizingEstimator.ColumnInfo[] columns) { Contracts.CheckValue(columns, nameof(columns)); return columns.Select(x => (x.Name, x.InputColumnName)).ToArray(); @@ -279,7 +252,7 @@ protected override void CheckInputColumn(Schema inputSchema, int col, int srcCol new VectorType(NumberType.Float, _transformInfos[col].SrcDim).ToString(), type.ToString()); } - public RandomFourierFeaturizingTransformer(IHostEnvironment env, IDataView input, ColumnInfo[] columns) + internal RandomFourierFeaturizingTransformer(IHostEnvironment env, IDataView input, RandomFourierFeaturizingEstimator.ColumnInfo[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(RandomFourierFeaturizingTransformer)), GetColumnPairs(columns)) { var avgDistances = 
GetAvgDistances(columns, input); @@ -305,7 +278,7 @@ private static int RoundUp(int cflt, int cfltAlign) return cblob * cfltAlign; } - private float[] GetAvgDistances(ColumnInfo[] columns, IDataView input) + private float[] GetAvgDistances(RandomFourierFeaturizingEstimator.ColumnInfo[] columns, IDataView input) { var avgDistances = new float[columns.Length]; const int reservoirSize = 5000; @@ -448,27 +421,27 @@ private RandomFourierFeaturizingTransformer(IHost host, ModelLoadContext ctx) } // Factory method for SignatureDataTransform. - private static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + private static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); - env.CheckValue(args, nameof(args)); + env.CheckValue(options, nameof(options)); env.CheckValue(input, nameof(input)); - env.CheckValue(args.Columns, nameof(args.Columns)); - var cols = new ColumnInfo[args.Columns.Length]; + env.CheckValue(options.Columns, nameof(options.Columns)); + var cols = new RandomFourierFeaturizingEstimator.ColumnInfo[options.Columns.Length]; using (var ch = env.Start("ValidateArgs")) { for (int i = 0; i < cols.Length; i++) { - var item = args.Columns[i]; - cols[i] = new ColumnInfo( + var item = options.Columns[i]; + cols[i] = new RandomFourierFeaturizingEstimator.ColumnInfo( item.Name, - item.NewDim ?? args.NewDim, - item.UseSin ?? args.UseSin, + item.NewDim ?? options.NewDim, + item.UseSin ?? options.UseSin, item.Source ?? item.Name, - item.MatrixGenerator ?? args.MatrixGenerator, - item.Seed ?? args.Seed); + item.MatrixGenerator ?? options.MatrixGenerator, + item.Seed ?? 
options.Seed); }; } return new RandomFourierFeaturizingTransformer(env, input, cols).MakeDataTransform(input); @@ -639,7 +612,7 @@ private void TransformFeatures(in VBuffer src, ref VBuffer dst, Tr } /// - /// Estimator which takes set of vector columns and maps its input to a random low-dimensional feature space. + /// Maps vector columns to a low -dimensional feature space. /// public sealed class RandomFourierFeaturizingEstimator : IEstimator { @@ -650,31 +623,89 @@ internal static class Defaults public const bool UseSin = false; } + /// + /// Describes how the transformer handles one Gcn column pair. + /// + public sealed class ColumnInfo + { + /// + /// Name of the column resulting from the transformation of . + /// + public readonly string Name; + /// + /// Name of the column to transform. + /// + public readonly string InputColumnName; + /// + /// Which fourier generator to use. + /// + public readonly IComponentFactory Generator; + /// + /// The number of random Fourier features to create. + /// + public readonly int NewDim; + /// + /// Create two features for every random Fourier frequency? (one for cos and one for sin). + /// + public readonly bool UseSin; + /// + /// The seed of the random number generator for generating the new features (if unspecified, the global random is used). + /// + public readonly int? Seed; + + /// + /// Describes how the transformer handles one column pair. + /// + /// Name of the column resulting from the transformation of . + /// The number of random Fourier features to create. + /// Create two features for every random Fourier frequency? (one for cos and one for sin). + /// Name of column to transform. + /// Which fourier generator to use. + /// The seed of the random number generator for generating the new features (if unspecified, the global random is used). + public ColumnInfo(string name, int newDim, bool useSin, string inputColumnName = null, IComponentFactory generator = null, int? 
seed = null) + { + Contracts.CheckUserArg(newDim > 0, nameof(newDim), "must be positive."); + InputColumnName = inputColumnName ?? name; + Name = name; + Generator = generator ?? new GaussianFourierSampler.Arguments(); + NewDim = newDim; + UseSin = useSin; + Seed = seed; + } + } + private readonly IHost _host; - private readonly RandomFourierFeaturizingTransformer.ColumnInfo[] _columns; + private readonly ColumnInfo[] _columns; /// - /// Convinence constructor for simple one column case + /// Convinence constructor for simple one column case. /// /// Host Environment. /// Name of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. /// The number of random Fourier features to create. /// Create two features for every random Fourier frequency? (one for cos and one for sin). - public RandomFourierFeaturizingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, int newDim = Defaults.NewDim, bool useSin = Defaults.UseSin) - : this(env, new RandomFourierFeaturizingTransformer.ColumnInfo(outputColumnName, newDim, useSin, inputColumnName ?? outputColumnName)) + internal RandomFourierFeaturizingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, int newDim = Defaults.NewDim, bool useSin = Defaults.UseSin) + : this(env, new ColumnInfo(outputColumnName, newDim, useSin, inputColumnName ?? outputColumnName)) { } - public RandomFourierFeaturizingEstimator(IHostEnvironment env, params RandomFourierFeaturizingTransformer.ColumnInfo[] columns) + internal RandomFourierFeaturizingEstimator(IHostEnvironment env, params ColumnInfo[] columns) { Contracts.CheckValue(env, nameof(env)); _host = env.Register(nameof(RandomFourierFeaturizingEstimator)); _columns = columns; } + /// + /// Trains and returns a . 
+ /// public RandomFourierFeaturizingTransformer Fit(IDataView input) => new RandomFourierFeaturizingTransformer(_host, input, _columns); + /// + /// Returns the of the schema which will be produced by the transformer. + /// Used for schema propagation and verification in a pipeline. + /// public SchemaShape GetOutputSchema(SchemaShape inputSchema) { _host.CheckValue(inputSchema, nameof(inputSchema)); diff --git a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs index e4da959d74..4a1da4564c 100644 --- a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs @@ -32,7 +32,7 @@ [assembly: LoadableClass(typeof(IRowMapper), typeof(StopWordsRemovingTransformer), null, typeof(SignatureLoadRowMapper), "Stopwords Remover Transform", StopWordsRemovingTransformer.LoaderSignature)] -[assembly: LoadableClass(CustomStopWordsRemovingTransformer.Summary, typeof(IDataTransform), typeof(CustomStopWordsRemovingTransformer), typeof(CustomStopWordsRemovingTransformer.Arguments), typeof(SignatureDataTransform), +[assembly: LoadableClass(CustomStopWordsRemovingTransformer.Summary, typeof(IDataTransform), typeof(CustomStopWordsRemovingTransformer), typeof(CustomStopWordsRemovingTransformer.Options), typeof(SignatureDataTransform), "Custom Stopwords Remover Transform", "CustomStopWordsRemoverTransform", "CustomStopWords")] [assembly: LoadableClass(CustomStopWordsRemovingTransformer.Summary, typeof(IDataTransform), typeof(CustomStopWordsRemovingTransformer), null, typeof(SignatureLoadDataTransform), @@ -642,7 +642,7 @@ internal abstract class ArgumentsBase public string StopwordsColumn; } - internal sealed class Arguments : ArgumentsBase + internal sealed class Options : ArgumentsBase { [Argument(ArgumentType.Multiple, HelpText = "New column definition(s)", Name = "Column", ShortName = "col", SortOrder = 1)] public Column[] 
Columns; @@ -713,7 +713,7 @@ private IDataLoader GetLoaderForStopwords(IChannel ch, string dataFile, if (isBinary || isTranspose) { ch.Assert(isBinary != isTranspose); - ch.CheckUserArg(!string.IsNullOrWhiteSpace(stopwordsCol), nameof(Arguments.StopwordsColumn), + ch.CheckUserArg(!string.IsNullOrWhiteSpace(stopwordsCol), nameof(Options.StopwordsColumn), "stopwordsColumn should be specified"); if (isBinary) dataLoader = new BinaryLoader(Host, new BinaryLoader.Arguments(), fileSource); @@ -772,7 +772,7 @@ private void LoadStopWords(IChannel ch, ReadOnlyMemory stopwords, string d warnEmpty = false; } } - ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.Stopword), "stopwords is empty"); + ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Options.Stopword), "stopwords is empty"); } else { @@ -780,9 +780,9 @@ private void LoadStopWords(IChannel ch, ReadOnlyMemory stopwords, string d var loader = GetLoaderForStopwords(ch, dataFile, loaderFactory, ref srcCol); if (!loader.Schema.TryGetColumnIndex(srcCol, out int colSrcIndex)) - throw ch.ExceptUserArg(nameof(Arguments.StopwordsColumn), "Unknown column '{0}'", srcCol); + throw ch.ExceptUserArg(nameof(Options.StopwordsColumn), "Unknown column '{0}'", srcCol); var typeSrc = loader.Schema[colSrcIndex].Type; - ch.CheckUserArg(typeSrc is TextType, nameof(Arguments.StopwordsColumn), "Must be a scalar text column"); + ch.CheckUserArg(typeSrc is TextType, nameof(Options.StopwordsColumn), "Must be a scalar text column"); // Accumulate the stopwords. using (var cursor = loader.GetRowCursor(loader.Schema[srcCol])) @@ -805,10 +805,13 @@ private void LoadStopWords(IChannel ch, ReadOnlyMemory stopwords, string d } } } - ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Arguments.DataFile), "dataFile is empty"); + ch.CheckUserArg(stopWordsMap.Count > 0, nameof(Options.DataFile), "dataFile is empty"); } } + /// + /// The names of the input output column pairs on which this transformation is applied. 
+ /// public IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly(); /// @@ -817,7 +820,7 @@ private void LoadStopWords(IChannel ch, ReadOnlyMemory stopwords, string d /// The environment. /// Array of words to remove. /// Pairs of columns to remove stop words from. - public CustomStopWordsRemovingTransformer(IHostEnvironment env, string[] stopwords, params (string outputColumnName, string inputColumnName)[] columns) : + internal CustomStopWordsRemovingTransformer(IHostEnvironment env, string[] stopwords, params (string outputColumnName, string inputColumnName)[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(RegistrationName), columns) { _stopWordsMap = new NormStr.Pool(); @@ -938,24 +941,24 @@ private static CustomStopWordsRemovingTransformer Create(IHostEnvironment env, M } // Factory method for SignatureDataTransform. - internal static IDataTransform Create(IHostEnvironment env, Arguments args, IDataView input) + internal static IDataTransform Create(IHostEnvironment env, Options options, IDataView input) { Contracts.CheckValue(env, nameof(env)); - env.CheckValue(args, nameof(args)); + env.CheckValue(options, nameof(options)); env.CheckValue(input, nameof(input)); - env.CheckValue(args.Columns, nameof(args.Columns)); - var cols = new (string outputColumnName, string inputColumnName)[args.Columns.Length]; + env.CheckValue(options.Columns, nameof(options.Columns)); + var cols = new (string outputColumnName, string inputColumnName)[options.Columns.Length]; for (int i = 0; i < cols.Length; i++) { - var item = args.Columns[i]; + var item = options.Columns[i]; cols[i] = (item.Name, item.Source ?? 
item.Name); } CustomStopWordsRemovingTransformer transfrom = null; - if (Utils.Size(args.Stopwords) > 0) - transfrom = new CustomStopWordsRemovingTransformer(env, args.Stopwords, cols); + if (Utils.Size(options.Stopwords) > 0) + transfrom = new CustomStopWordsRemovingTransformer(env, options.Stopwords, cols); else - transfrom = new CustomStopWordsRemovingTransformer(env, args.Stopword, args.DataFile, args.StopwordsColumn, args.Loader, cols); + transfrom = new CustomStopWordsRemovingTransformer(env, options.Stopword, options.DataFile, options.StopwordsColumn, options.Loader, cols); return transfrom.MakeDataTransform(input); } @@ -1057,7 +1060,7 @@ public sealed class CustomStopWordsRemovingEstimator : TrivialEstimatorName of the column resulting from the transformation of . /// Name of the column to transform. If set to , the value of the will be used as source. /// Array of words to remove. - public CustomStopWordsRemovingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, params string[] stopwords) + internal CustomStopWordsRemovingEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null, params string[] stopwords) : this(env, new[] { (outputColumnName, inputColumnName ?? outputColumnName) }, stopwords) { } @@ -1069,11 +1072,15 @@ public CustomStopWordsRemovingEstimator(IHostEnvironment env, string outputColum /// The environment. /// Pairs of columns to remove stop words on. /// Array of words to remove. 
- public CustomStopWordsRemovingEstimator(IHostEnvironment env, (string outputColumnName, string inputColumnName)[] columns, string[] stopwords) : + internal CustomStopWordsRemovingEstimator(IHostEnvironment env, (string outputColumnName, string inputColumnName)[] columns, string[] stopwords) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(CustomStopWordsRemovingEstimator)), new CustomStopWordsRemovingTransformer(env, stopwords, columns)) { } + /// + /// Returns the of the schema which will be produced by the transformer. + /// Used for schema propagation and verification in a pipeline. + /// public override SchemaShape GetOutputSchema(SchemaShape inputSchema) { Host.CheckValue(inputSchema, nameof(inputSchema)); diff --git a/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs b/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs index 25931c5787..7c9524e872 100644 --- a/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs +++ b/src/Microsoft.ML.Transforms/Text/TextFeaturizingEstimator.cs @@ -409,13 +409,13 @@ public ITransformer Fit(IDataView input) if (tparams.VectorNormalizer != TextNormKind.None) { - var xfCols = new List(2); + var xfCols = new List(2); if (charFeatureCol != null) { var dstCol = GenerateColumnName(view.Schema, charFeatureCol, "LpCharNorm"); tempCols.Add(dstCol); - xfCols.Add(new LpNormalizingTransformer.LpNormColumnInfo(dstCol, charFeatureCol, normalizerKind: tparams.LpNormalizerKind)); + xfCols.Add(new LpNormalizingEstimator.LpNormColumnInfo(dstCol, charFeatureCol, normalizerKind: tparams.LpNormalizerKind)); charFeatureCol = dstCol; } @@ -423,7 +423,7 @@ public ITransformer Fit(IDataView input) { var dstCol = GenerateColumnName(view.Schema, wordFeatureCol, "LpWordNorm"); tempCols.Add(dstCol); - xfCols.Add(new LpNormalizingTransformer.LpNormColumnInfo(dstCol, wordFeatureCol, normalizerKind: tparams.LpNormalizerKind)); + xfCols.Add(new LpNormalizingEstimator.LpNormColumnInfo(dstCol, wordFeatureCol, 
normalizerKind: tparams.LpNormalizerKind)); wordFeatureCol = dstCol; } diff --git a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv index 1c8470a58d..2ddb2bb4b1 100644 --- a/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv +++ b/test/BaselineOutput/Common/EntryPoints/core_ep-list.tsv @@ -70,7 +70,7 @@ Trainers.StochasticDualCoordinateAscentClassifier The SDCA linear multi-class cl Trainers.StochasticDualCoordinateAscentRegressor The SDCA linear regression trainer. Microsoft.ML.Trainers.Sdca TrainRegression Microsoft.ML.Trainers.SdcaRegressionTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+RegressionOutput Trainers.StochasticGradientDescentBinaryClassifier Train an Hogwild SGD binary model. Microsoft.ML.Trainers.StochasticGradientDescentClassificationTrainer TrainBinary Microsoft.ML.Trainers.StochasticGradientDescentClassificationTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+BinaryClassificationOutput Trainers.SymSgdBinaryClassifier Train a symbolic SGD. Microsoft.ML.Trainers.SymSgd.SymSgdClassificationTrainer TrainSymSgd Microsoft.ML.Trainers.SymSgd.SymSgdClassificationTrainer+Options Microsoft.ML.EntryPoints.CommonOutputs+BinaryClassificationOutput -Transforms.ApproximateBootstrapSampler Approximate bootstrap sampling. Microsoft.ML.Transforms.BootstrapSample GetSample Microsoft.ML.Transforms.BootstrapSamplingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput +Transforms.ApproximateBootstrapSampler Approximate bootstrap sampling. Microsoft.ML.Transforms.BootstrapSample GetSample Microsoft.ML.Transforms.BootstrapSamplingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.BinaryPredictionScoreColumnsRenamer For binary prediction, it renames the PredictedLabel and Score columns to include the name of the positive class. 
Microsoft.ML.EntryPoints.ScoreModel RenameBinaryPredictionScoreColumns Microsoft.ML.EntryPoints.ScoreModel+RenameBinaryPredictionScoreColumnsInput Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.BinNormalizer The values are assigned into equidensity bins and a value is mapped to its bin_number/number_of_bins. Microsoft.ML.Data.Normalize Bin Microsoft.ML.Transforms.Normalizers.NormalizeTransform+BinArguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.CategoricalHashOneHotVectorizer Converts the categorical value into an indicator array by hashing the value and using the hash as an index in the bag. If the input column is a vector, a single indicator bag is returned for it. Microsoft.ML.Transforms.Categorical.Categorical CatTransformHash Microsoft.ML.Transforms.Categorical.OneHotHashEncodingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput @@ -90,7 +90,7 @@ Transforms.FeatureCombiner Combines all the features into one feature column. Mi Transforms.FeatureContributionCalculationTransformer For each data point, calculates the contribution of individual features to the model prediction. Microsoft.ML.Data.FeatureContributionEntryPoint FeatureContributionCalculation Microsoft.ML.Data.FeatureContributionCalculatingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.FeatureSelectorByCount Selects the slots for which the count of non-default values is greater than or equal to a threshold. Microsoft.ML.Transforms.SelectFeatures CountSelect Microsoft.ML.Transforms.FeatureSelection.CountFeatureSelectingEstimator+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.FeatureSelectorByMutualInformation Selects the top k slots across all specified columns ordered by their mutual information with the label column. 
Microsoft.ML.Transforms.SelectFeatures MutualInformationSelect Microsoft.ML.Transforms.FeatureSelection.MutualInformationFeatureSelectingEstimator+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput -Transforms.GlobalContrastNormalizer Performs a global contrast normalization on input values: Y = (s * X - M) / D, where s is a scale, M is mean and D is either L2 norm or standard deviation. Microsoft.ML.Transforms.Projections.LpNormalization GcNormalize Microsoft.ML.Transforms.Projections.LpNormalizingTransformer+GcnArguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput +Transforms.GlobalContrastNormalizer Performs a global contrast normalization on input values: Y = (s * X - M) / D, where s is a scale, M is mean and D is either L2 norm or standard deviation. Microsoft.ML.Transforms.Projections.LpNormalization GcNormalize Microsoft.ML.Transforms.Projections.LpNormalizingTransformer+GcnOptions Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.HashConverter Converts column values into hashes. This transform accepts both numeric and text inputs, both single and vector-valued columns. Microsoft.ML.Transforms.Conversions.HashJoin Apply Microsoft.ML.Transforms.Conversions.HashJoiningTransform+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.ImageGrayscale Convert image into grayscale. Microsoft.ML.ImageAnalytics.EntryPoints.ImageAnalytics ImageGrayscale Microsoft.ML.ImageAnalytics.ImageGrayscalingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.ImageLoader Load images from files. Microsoft.ML.ImageAnalytics.EntryPoints.ImageAnalytics ImageLoader Microsoft.ML.ImageAnalytics.ImageLoadingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput @@ -102,7 +102,7 @@ Transforms.LabelIndicator Label remapper used by OVA Microsoft.ML.Transforms.Lab Transforms.LabelToFloatConverter Transforms the label to float to make it suitable for regression. 
Microsoft.ML.EntryPoints.FeatureCombiner PrepareRegressionLabel Microsoft.ML.EntryPoints.FeatureCombiner+RegressionLabelInput Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.LightLda The LDA transform implements LightLDA, a state-of-the-art implementation of Latent Dirichlet Allocation. Microsoft.ML.Transforms.Text.TextAnalytics LightLda Microsoft.ML.Transforms.Text.LatentDirichletAllocationTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.LogMeanVarianceNormalizer Normalizes the data based on the computed mean and variance of the logarithm of the data. Microsoft.ML.Data.Normalize LogMeanVar Microsoft.ML.Transforms.Normalizers.NormalizeTransform+LogMeanVarArguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput -Transforms.LpNormalizer Normalize vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf). Performs the following operation on a vector X: Y = (X - M) / D, where M is mean and D is either L2 norm, L1 norm or LInf norm. Microsoft.ML.Transforms.Projections.LpNormalization Normalize Microsoft.ML.Transforms.Projections.LpNormalizingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput +Transforms.LpNormalizer Normalize vectors (rows) individually by rescaling them to unit norm (L2, L1 or LInf). Performs the following operation on a vector X: Y = (X - M) / D, where M is mean and D is either L2 norm, L1 norm or LInf norm. Microsoft.ML.Transforms.Projections.LpNormalization Normalize Microsoft.ML.Transforms.Projections.LpNormalizingTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.ManyHeterogeneousModelCombiner Combines a sequence of TransformModels and a PredictorModel into a single PredictorModel. 
Microsoft.ML.EntryPoints.ModelOperations CombineModels Microsoft.ML.EntryPoints.ModelOperations+PredictorModelInput Microsoft.ML.EntryPoints.ModelOperations+PredictorModelOutput Transforms.MeanVarianceNormalizer Normalizes the data based on the computed mean and variance of the data. Microsoft.ML.Data.Normalize MeanVar Microsoft.ML.Transforms.Normalizers.NormalizeTransform+MeanVarArguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.MinMaxNormalizer Normalizes the data based on the observed minimum and maximum values of the data. Microsoft.ML.Data.Normalize MinMax Microsoft.ML.Transforms.Normalizers.NormalizeTransform+MinMaxArguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput @@ -115,7 +115,7 @@ Transforms.ModelCombiner Combines a sequence of TransformModels into a single mo Transforms.NGramTranslator Produces a bag of counts of ngrams (sequences of consecutive values of length 1-n) in a given vector of keys. It does so by building a dictionary of ngrams and using the id in the dictionary as the index in the bag. Microsoft.ML.Transforms.Text.TextAnalytics NGramTransform Microsoft.ML.Transforms.Text.NgramExtractingTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.NoOperation Does nothing. Microsoft.ML.Data.NopTransform Nop Microsoft.ML.Data.NopTransform+NopInput Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.OptionalColumnCreator If the source column does not exist after deserialization, create a column with the right type and default values. Microsoft.ML.Transforms.OptionalColumnTransform MakeOptional Microsoft.ML.Transforms.OptionalColumnTransform+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput -Transforms.PcaCalculator PCA is a dimensionality-reduction transform which computes the projection of a numeric vector onto a low-rank subspace. 
Microsoft.ML.Transforms.Projections.PcaTransformer Calculate Microsoft.ML.Transforms.Projections.PcaTransformer+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput +Transforms.PcaCalculator PCA is a dimensionality-reduction transform which computes the projection of a numeric vector onto a low-rank subspace. Microsoft.ML.Transforms.Projections.PrincipalComponentAnalysisTransformer Calculate Microsoft.ML.Transforms.Projections.PrincipalComponentAnalysisTransformer+Options Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.PredictedLabelColumnOriginalValueConverter Transforms a predicted label column to its original values, unless it is of type bool. Microsoft.ML.EntryPoints.FeatureCombiner ConvertPredictedLabel Microsoft.ML.EntryPoints.FeatureCombiner+PredictedLabelInput Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.RandomNumberGenerator Adds a column with a generated number sequence. Microsoft.ML.Transforms.RandomNumberGenerator Generate Microsoft.ML.Transforms.GenerateNumberTransform+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput Transforms.RowRangeFilter Filters a dataview on a column of type Single, Double or Key (contiguous). Keeps the values that are in the specified min/max range. NaNs are always filtered out. If the input is a Key type, the min/max are considered percentages of the number of values. 
Microsoft.ML.EntryPoints.SelectRows FilterByRange Microsoft.ML.Transforms.RangeFilter+Arguments Microsoft.ML.EntryPoints.CommonOutputs+TransformOutput diff --git a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs index b9c43e2653..b08505e149 100644 --- a/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs +++ b/test/Microsoft.ML.Core.Tests/UnitTests/TestEntryPoints.cs @@ -733,7 +733,6 @@ public void EntryPointCalibrate() var scoredFf = ScoreModel.Score(Env, new ScoreModel.Input() { Data = splitOutput.TestData[2], PredictorModel = twiceCalibratedFfModel }).ScoredData; } - [Fact] public void EntryPointPipelineEnsemble() { @@ -746,8 +745,8 @@ public void EntryPointPipelineEnsemble() { var data = splitOutput.TrainData[i]; data = new RandomFourierFeaturizingEstimator(Env, new[] { - new RandomFourierFeaturizingTransformer.ColumnInfo("Features1", 10, false, "Features"), - new RandomFourierFeaturizingTransformer.ColumnInfo("Features2", 10, false, "Features"), + new RandomFourierFeaturizingEstimator.ColumnInfo("Features1", 10, false, "Features"), + new RandomFourierFeaturizingEstimator.ColumnInfo("Features2", 10, false, "Features"), }).Fit(data).Transform(data); data = new ColumnConcatenatingTransformer(Env, "Features", new[] { "Features1", "Features2" }).Transform(data); @@ -1198,8 +1197,8 @@ public void EntryPointMulticlassPipelineEnsemble() { var data = splitOutput.TrainData[i]; data = new RandomFourierFeaturizingEstimator(Env, new[] { - new RandomFourierFeaturizingTransformer.ColumnInfo("Features1", 10, false, "Features"), - new RandomFourierFeaturizingTransformer.ColumnInfo("Features2", 10, false, "Features"), + new RandomFourierFeaturizingEstimator.ColumnInfo("Features1", 10, false, "Features"), + new RandomFourierFeaturizingEstimator.ColumnInfo("Features2", 10, false, "Features"), }).Fit(data).Transform(data); data = new ColumnConcatenatingTransformer(Env, "Features", new[] { "Features1", 
"Features2" }).Transform(data); diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index 0b95aff834..78120af00b 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -527,14 +527,14 @@ private void ExecuteTFTransformMNISTConvTrainingTest(bool shuffle, int? shuffleS if (shuffle) { // Shuffle training data set - preprocessedTrainData = new RowShufflingTransformer(mlContext, new RowShufflingTransformer.Arguments() + preprocessedTrainData = new RowShufflingTransformer(mlContext, new RowShufflingTransformer.Options() { ForceShuffle = shuffle, ForceShuffleSeed = shuffleSeed }, trainData); // Shuffle test data set - preprocessedTestData = new RowShufflingTransformer(mlContext, new RowShufflingTransformer.Arguments() + preprocessedTestData = new RowShufflingTransformer(mlContext, new RowShufflingTransformer.Options() { ForceShuffle = shuffle, ForceShuffleSeed = shuffleSeed diff --git a/test/Microsoft.ML.Tests/Transformers/NormalizerTests.cs b/test/Microsoft.ML.Tests/Transformers/NormalizerTests.cs index f3351ce34d..eb37308afd 100644 --- a/test/Microsoft.ML.Tests/Transformers/NormalizerTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/NormalizerTests.cs @@ -273,8 +273,8 @@ public void LpGcNormAndWhiteningWorkout() separator: ';', hasHeader: true) .Read(dataSource); - var est = new LpNormalizingEstimator(ML, "lpnorm", "features") - .Append(new GlobalContrastNormalizingEstimator(ML, "gcnorm", "features")) + var est = ML.Transforms.Projection.LpNormalize("lpnorm", "features") + .Append(ML.Transforms.Projection.GlobalContrastNormalize("gcnorm", "features")) .Append(new VectorWhiteningEstimator(ML, "whitened", "features")); TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); @@ -369,8 +369,8 @@ public void 
LpNormWorkout() separator: ';', hasHeader: true) .Read(dataSource); - var est = new LpNormalizingEstimator(ML, "lpNorm1", "features") - .Append(new LpNormalizingEstimator(ML, "lpNorm2", "features", normKind: LpNormalizingEstimatorBase.NormalizerKind.L1Norm, substractMean: true)); + var est = ML.Transforms.Projection.LpNormalize("lpNorm1", "features") + .Append(ML.Transforms.Projection.LpNormalize("lpNorm2", "features", normKind: LpNormalizingEstimatorBase.NormalizerKind.L1Norm, subMean: true)); TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); var outputPath = GetOutputPath("NormalizerEstimator", "lpNorm.tsv"); @@ -402,7 +402,7 @@ public void TestLpNormOldSavingAndLoading() c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), separator: ';', hasHeader: true) .Read(dataSource).AsDynamic; - var pipe = new LpNormalizingEstimator(ML, "whitened", "features"); + var pipe = ML.Transforms.Projection.LpNormalize("whitened", "features"); var result = pipe.Fit(dataView).Transform(dataView); var resultRoles = new RoleMappedData(result); @@ -428,8 +428,8 @@ public void GcnWorkout() separator: ';', hasHeader: true) .Read(dataSource); - var est = new GlobalContrastNormalizingEstimator(ML, "gcnNorm1", "features") - .Append(new GlobalContrastNormalizingEstimator(ML, "gcnNorm2", "features", substractMean: false, useStdDev: true, scale: 3)); + var est = ML.Transforms.Projection.GlobalContrastNormalize("gcnNorm1", "features") + .Append(ML.Transforms.Projection.GlobalContrastNormalize("gcnNorm2", "features", substractMean: false, useStdDev: true, scale: 3)); TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); var outputPath = GetOutputPath("NormalizerEstimator", "gcnNorm.tsv"); @@ -461,7 +461,7 @@ public void TestGcnNormOldSavingAndLoading() c => (label: c.LoadFloat(11), features: c.LoadFloat(0, 10)), separator: ';', hasHeader: true) .Read(dataSource).AsDynamic; - var pipe = new GlobalContrastNormalizingEstimator(ML, 
"whitened", "features"); + var pipe = ML.Transforms.Projection.GlobalContrastNormalize("whitened", "features"); var result = pipe.Fit(dataView).Transform(dataView); var resultRoles = new RoleMappedData(result); diff --git a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs index d3214b889e..f349282caa 100644 --- a/test/Microsoft.ML.Tests/Transformers/PcaTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/PcaTests.cs @@ -40,10 +40,10 @@ public void PcaWorkout() separator: ';', hasHeader: true) .Read(_dataSource); - var est = new PrincipalComponentAnalysisEstimator(_env, "pca", "features", rank: 4, seed: 10); + var est = ML.Transforms.Projection.ProjectToPrincipalComponents("pca", "features", rank: 4, seed: 10); TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); - var estNonDefaultArgs = new PrincipalComponentAnalysisEstimator(_env, "pca", "features", rank: 3, weightColumn: "weight", overSampling: 2, center: false); + var estNonDefaultArgs = ML.Transforms.Projection.ProjectToPrincipalComponents("pca", "features", rank: 3, weightColumn: "weight", overSampling: 2, center: false); TestEstimatorCore(estNonDefaultArgs, data.AsDynamic, invalidInput: invalidData.AsDynamic); Done(); @@ -57,7 +57,7 @@ public void TestPcaEstimator() separator: ';', hasHeader: true) .Read(_dataSource); - var est = new PrincipalComponentAnalysisEstimator(_env, "pca", "features", rank: 5, seed: 1); + var est = ML.Transforms.Projection.ProjectToPrincipalComponents("pca", "features", rank: 5, seed: 1); var outputPath = GetOutputPath("PCA", "pca.tsv"); using (var ch = _env.Start("save")) { diff --git a/test/Microsoft.ML.Tests/Transformers/RffTests.cs b/test/Microsoft.ML.Tests/Transformers/RffTests.cs index 68af36863b..d89d7b9676 100644 --- a/test/Microsoft.ML.Tests/Transformers/RffTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/RffTests.cs @@ -55,9 +55,9 @@ public void RffWorkout() var dataView = 
ML.Data.ReadFromEnumerable(data); var generator = new GaussianFourierSampler.Arguments(); - var pipe = new RandomFourierFeaturizingEstimator(Env, new[]{ - new RandomFourierFeaturizingTransformer.ColumnInfo("RffA", 5, false, "A"), - new RandomFourierFeaturizingTransformer.ColumnInfo("RffB", 10, true, "A", new LaplacianFourierSampler.Arguments()) + var pipe = ML.Transforms.Projection.CreateRandomFourierFeatures(new[]{ + new RandomFourierFeaturizingEstimator.ColumnInfo("RffA", 5, false, "A"), + new RandomFourierFeaturizingEstimator.ColumnInfo("RffB", 10, true, "A", new LaplacianFourierSampler.Arguments()) }); TestEstimatorCore(pipe, dataView, invalidInput: invalidData, validForFitNotValidForTransformInput: validFitInvalidData); @@ -111,9 +111,9 @@ public void TestOldSavingAndLoading() }; var dataView = ML.Data.ReadFromEnumerable(data); - var est = new RandomFourierFeaturizingEstimator(Env, new[]{ - new RandomFourierFeaturizingTransformer.ColumnInfo("RffA", 5, false, "A"), - new RandomFourierFeaturizingTransformer.ColumnInfo("RffB", 10, true, "A", new LaplacianFourierSampler.Arguments()) + var est = ML.Transforms.Projection.CreateRandomFourierFeatures(new[]{ + new RandomFourierFeaturizingEstimator.ColumnInfo("RffA", 5, false, "A"), + new RandomFourierFeaturizingEstimator.ColumnInfo("RffB", 10, true, "A", new LaplacianFourierSampler.Arguments()) }); var result = est.Fit(dataView).Transform(dataView); var resultRoles = new RoleMappedData(result);