diff --git a/src/Microsoft.ML.Api/DataViewConstructionUtils.cs b/src/Microsoft.ML.Api/DataViewConstructionUtils.cs index 612be99faf..a4bc8cfda5 100644 --- a/src/Microsoft.ML.Api/DataViewConstructionUtils.cs +++ b/src/Microsoft.ML.Api/DataViewConstructionUtils.cs @@ -207,7 +207,16 @@ private Delegate CreateGetter(ColumnType colType, InternalSchemaDefinition.Colum Host.Assert(colType.RawType == Nullable.GetUnderlyingType(outputType)); else Host.Assert(colType.RawType == outputType); - del = CreateDirectGetterDelegate; + + if (!colType.IsKey) + del = CreateDirectGetterDelegate; + else + { + var keyRawType = colType.RawType; + Host.Assert(colType.AsKey.Contiguous); + Func delForKey = CreateKeyGetterDelegate; + return Utils.MarshalInvoke(delForKey, keyRawType, peek, colType); + } } else { @@ -288,6 +297,38 @@ private Delegate CreateDirectGetterDelegate(Delegate peekDel) peek(GetCurrentRowObject(), Position, ref dst)); } + private Delegate CreateKeyGetterDelegate(Delegate peekDel, ColumnType colType) + { + // Builds a getter that maps the raw key read through peekDel to key - min + 1, or to 0 when the raw key is outside [min, max]. Make sure the function is dealing with key. + Host.Check(colType.IsKey); + // Following equations work only with contiguous key type. + Host.Check(colType.AsKey.Contiguous); + // Following equations work only with unsigned integers (ulong, uint, ushort, byte). NOTE(review): bool is not an unsigned integer and is kept only so no existing caller is newly rejected -- confirm it can be dropped. + Host.Check(typeof(TDst) == typeof(ulong) || typeof(TDst) == typeof(uint) || + typeof(TDst) == typeof(ushort) || typeof(TDst) == typeof(byte) || typeof(TDst) == typeof(bool)); + + // Convert delegate function to a function which can fetch the underlying value. 
+ var peek = peekDel as Peek; + Host.AssertValue(peek); + + TDst rawKeyValue = default; + ulong key = 0; // the raw key value as ulong + ulong min = colType.AsKey.Min; + ulong max = min + (ulong)colType.AsKey.Count - 1; + ulong result = 0; // the result as ulong: key - min + 1 when min <= key <= max, otherwise 0 + ValueGetter getter = (ref TDst dst) => + { + peek(GetCurrentRowObject(), Position, ref rawKeyValue); + key = (ulong)Convert.ChangeType(rawKeyValue, typeof(ulong)); + if (min <= key && key <= max) + result = key - min + 1; + else + result = 0; + dst = (TDst)Convert.ChangeType(result, typeof(TDst)); + }; + return getter; + } + protected abstract TRow GetCurrentRowObject(); public bool IsColumnActive(int col) diff --git a/src/Microsoft.ML.Recommender/MatrixFactorizationPredictor.cs b/src/Microsoft.ML.Recommender/MatrixFactorizationPredictor.cs index 44da79c703..6b71a165c6 100644 --- a/src/Microsoft.ML.Recommender/MatrixFactorizationPredictor.cs +++ b/src/Microsoft.ML.Recommender/MatrixFactorizationPredictor.cs @@ -13,7 +13,6 @@ using Microsoft.ML.Runtime.Model; using Microsoft.ML.Runtime.Recommender; using Microsoft.ML.Runtime.Recommender.Internal; -using Microsoft.ML.Trainers; using Microsoft.ML.Trainers.Recommender; [assembly: LoadableClass(typeof(MatrixFactorizationPredictor), null, typeof(SignatureLoadModel), "Matrix Factorization Predictor Executor", MatrixFactorizationPredictor.LoaderSignature)] @@ -347,9 +346,12 @@ private Delegate[] CreateGetter(IRow input, bool[] active) var getters = new Delegate[1]; if (active[0]) { + // First check if expected columns are ok and then create getters to access those columns' values. 
+ CheckInputSchema(input.Schema, _matrixColumnIndexColumnIndex, _matrixRowIndexCololumnIndex); - var matrixColumnIndexGetter = input.GetGetter(_matrixColumnIndexColumnIndex); - var matrixRowIndexGetter = input.GetGetter(_matrixRowIndexCololumnIndex); + var matrixColumnIndexGetter = RowCursorUtils.GetGetterAs(NumberType.U4, input, _matrixColumnIndexColumnIndex); + var matrixRowIndexGetter = RowCursorUtils.GetGetterAs(NumberType.U4, input, _matrixRowIndexCololumnIndex); + + // Assign the getter of the prediction score. It maps a pair of matrix column index and matrix row index to a scalar. getters[0] = _parent.GetGetter(matrixColumnIndexGetter, matrixRowIndexGetter); } return getters; diff --git a/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs b/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs index 5d32a4fd66..6750f98068 100644 --- a/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs +++ b/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs @@ -29,44 +29,50 @@ public sealed class MatrixFactorizationTrainer : TrainerBase - /// The row, column, and label columns that the trainer expects. This module uses tuples of (row index, column index, label value) to specify a matrix. + /// The row index, column index, and label columns needed to specify the training matrix. This trainer uses tuples of (row index, column index, label value) to specify a matrix. /// For example, a 2-by-2 matrix /// [9, 4] /// [8, 7] /// can be encoded as tuples (0, 0, 9), (0, 1, 4), (1, 0, 8), and (1, 1, 7). It means that the row/column/label column contains [0, 0, 1, 1]/ /// [0, 1, 0, 1]/[9, 4, 8, 7]. /// - public readonly SchemaShape.Column MatrixColumnIndexColumn; // column indices of the training matrix - public readonly SchemaShape.Column MatrixRowIndexColumn; // row indices of the training matrix - public readonly SchemaShape.Column LabelColumn; + + /// + /// The name of variable (i.e., column in a type system) used as matrix's column index. 
+ /// + public readonly string MatrixColumnIndexName; + + /// + /// The name of variable (i.e., column in a type system) used as matrix's row index. + /// + public readonly string MatrixRowIndexName; + + /// + /// The name of variable (i.e., column in a type system) used as matrix's element value. + /// + public readonly string LabelName; /// /// The contains general parameters for this trainer. @@ -95,7 +113,7 @@ public sealed class Arguments /// Extra information the trainer can use. For example, its validation set (if not null) can be use to evaluate the /// training progress made at each training iteration. /// - public readonly TrainerEstimatorContext Context; + private readonly TrainerEstimatorContext _context; /// /// Legacy constructor initializing a new instance of through the legacy @@ -149,11 +167,11 @@ public MatrixFactorizationTrainer(IHostEnvironment env, string labelColumn, stri _doNmf = args.NonNegative; Info = new TrainerInfo(normalization: false, caching: false); - Context = context; + _context = context; - LabelColumn = new SchemaShape.Column(labelColumn, SchemaShape.Column.VectorKind.Scalar, NumberType.R4, false); - MatrixColumnIndexColumn = new SchemaShape.Column(matrixColumnIndexColumnName, SchemaShape.Column.VectorKind.Scalar, NumberType.U4, true); - MatrixRowIndexColumn = new SchemaShape.Column(matrixRowIndexColumnName, SchemaShape.Column.VectorKind.Scalar, NumberType.U4, true); + LabelName = labelColumn; + MatrixColumnIndexName = matrixColumnIndexColumnName; + MatrixRowIndexName = matrixRowIndexColumnName; } /// @@ -210,22 +228,21 @@ private MatrixFactorizationPredictor TrainCore(IChannel ch, RoleMappedData data, int rowCount = matrixRowIndexColInfo.Type.KeyCount; ch.Assert(rowCount > 0); ch.Assert(colCount > 0); - // Checks for equality on the validation set ensure it is correct here. + // Checks for equality on the validation set ensure it is correct here. 
using (var cursor = data.Data.GetRowCursor(c => c == matrixColumnIndexColInfo.Index || c == matrixRowIndexColInfo.Index || c == data.Schema.Label.Index)) { // LibMF works only over single precision floats, but we want to be able to consume either. - ValueGetter labGetter = RowCursorUtils.GetGetterAs(NumberType.R4, cursor, data.Schema.Label.Index); - var matrixColumnIndexGetter = cursor.GetGetter(matrixColumnIndexColInfo.Index); - var matrixRowIndexGetter = cursor.GetGetter(matrixRowIndexColInfo.Index); + var labGetter = RowCursorUtils.GetGetterAs(NumberType.R4, cursor, data.Schema.Label.Index); + var matrixColumnIndexGetter = RowCursorUtils.GetGetterAs(NumberType.U4, cursor, matrixColumnIndexColInfo.Index); + var matrixRowIndexGetter = RowCursorUtils.GetGetterAs(NumberType.U4, cursor, matrixRowIndexColInfo.Index); if (validData == null) { // Have the trainer do its work. using (var buffer = PrepareBuffer()) { - buffer.Train(ch, rowCount, colCount, - cursor, labGetter, matrixRowIndexGetter, matrixColumnIndexGetter); + buffer.Train(ch, rowCount, colCount, cursor, labGetter, matrixRowIndexGetter, matrixColumnIndexGetter); predictor = new MatrixFactorizationPredictor(Host, buffer, matrixColumnIndexColInfo.Type.AsKey, matrixRowIndexColInfo.Type.AsKey); } } @@ -234,16 +251,16 @@ private MatrixFactorizationPredictor TrainCore(IChannel ch, RoleMappedData data, using (var validCursor = validData.Data.GetRowCursor( c => c == validMatrixColumnIndexColInfo.Index || c == validMatrixRowIndexColInfo.Index || c == validData.Schema.Label.Index)) { - ValueGetter validLabGetter = RowCursorUtils.GetGetterAs(NumberType.R4, validCursor, validData.Schema.Label.Index); - var validXGetter = validCursor.GetGetter(validMatrixColumnIndexColInfo.Index); - var validYGetter = validCursor.GetGetter(validMatrixRowIndexColInfo.Index); + ValueGetter validLabelGetter = RowCursorUtils.GetGetterAs(NumberType.R4, validCursor, validData.Schema.Label.Index); + var validMatrixColumnIndexGetter = 
RowCursorUtils.GetGetterAs(NumberType.U4, validCursor, validMatrixColumnIndexColInfo.Index); + var validMatrixRowIndexGetter = RowCursorUtils.GetGetterAs(NumberType.U4, validCursor, validMatrixRowIndexColInfo.Index); // Have the trainer do its work. using (var buffer = PrepareBuffer()) { buffer.TrainWithValidation(ch, rowCount, colCount, cursor, labGetter, matrixRowIndexGetter, matrixColumnIndexGetter, - validCursor, validLabGetter, validYGetter, validXGetter); + validCursor, validLabelGetter, validMatrixRowIndexGetter, validMatrixColumnIndexGetter); predictor = new MatrixFactorizationPredictor(Host, buffer, matrixColumnIndexColInfo.Type.AsKey, matrixRowIndexColInfo.Type.AsKey); } } @@ -268,12 +285,12 @@ public MatrixFactorizationPredictionTransformer Fit(IDataView input) MatrixFactorizationPredictor model = null; var roles = new List>(); - roles.Add(new KeyValuePair(RoleMappedSchema.ColumnRole.Label, LabelColumn.Name)); - roles.Add(new KeyValuePair(RecommenderUtils.MatrixColumnIndexKind.Value, MatrixColumnIndexColumn.Name)); - roles.Add(new KeyValuePair(RecommenderUtils.MatrixRowIndexKind.Value, MatrixRowIndexColumn.Name)); + roles.Add(new KeyValuePair(RoleMappedSchema.ColumnRole.Label, LabelName)); + roles.Add(new KeyValuePair(RecommenderUtils.MatrixColumnIndexKind.Value, MatrixColumnIndexName)); + roles.Add(new KeyValuePair(RecommenderUtils.MatrixRowIndexKind.Value, MatrixRowIndexName)); var trainingData = new RoleMappedData(input, roles); - var validData = Context == null ? null : new RoleMappedData(Context.ValidationSet, roles); + var validData = _context == null ? 
null : new RoleMappedData(_context.ValidationSet, roles); using (var ch = Host.Start("Training")) using (var pch = Host.StartProgressChannel("Training")) @@ -281,7 +298,7 @@ public MatrixFactorizationPredictionTransformer Fit(IDataView input) model = TrainCore(ch, trainingData, validData); } - return new MatrixFactorizationPredictionTransformer(Host, model, input.Schema, MatrixColumnIndexColumn.Name, MatrixRowIndexColumn.Name); + return new MatrixFactorizationPredictionTransformer(Host, model, input.Schema, MatrixColumnIndexName, MatrixRowIndexName); } public SchemaShape GetOutputSchema(SchemaShape inputSchema) @@ -297,13 +314,15 @@ void CheckColumnsCompatible(SchemaShape.Column cachedColumn, string expectedColu throw Host.Except($"{expectedColumnName} column '{cachedColumn.Name}' is not compatible"); } - // In prediction phase, no label column is expected. - if (LabelColumn != null) - CheckColumnsCompatible(LabelColumn, LabelColumn.Name); + // Check if label column is good. + var labelColumn = new SchemaShape.Column(LabelName, SchemaShape.Column.VectorKind.Scalar, NumberType.R4, false); + CheckColumnsCompatible(labelColumn, LabelName); - // In both of training and prediction phases, we need columns of user ID and column ID. - CheckColumnsCompatible(MatrixColumnIndexColumn, MatrixColumnIndexColumn.Name); - CheckColumnsCompatible(MatrixRowIndexColumn, MatrixRowIndexColumn.Name); + // Check if columns of matrix's row and column indexes are good. Note that column of IDataView and column of matrix are two different things. 
+ var matrixColumnIndexColumn = new SchemaShape.Column(MatrixColumnIndexName, SchemaShape.Column.VectorKind.Scalar, NumberType.U4, true); + var matrixRowIndexColumn = new SchemaShape.Column(MatrixRowIndexName, SchemaShape.Column.VectorKind.Scalar, NumberType.U4, true); + CheckColumnsCompatible(matrixColumnIndexColumn, MatrixColumnIndexName); + CheckColumnsCompatible(matrixRowIndexColumn, MatrixRowIndexName); // Input columns just pass through so that output column dictionary contains all input columns. var outColumns = inputSchema.Columns.ToDictionary(x => x.Name); @@ -317,7 +336,7 @@ void CheckColumnsCompatible(SchemaShape.Column cachedColumn, string expectedColu private SchemaShape.Column[] GetOutputColumnsCore(SchemaShape inputSchema) { - bool success = inputSchema.TryFindColumn(LabelColumn.Name, out var labelCol); + bool success = inputSchema.TryFindColumn(LabelName, out var labelCol); Contracts.Assert(success); return new[] diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/MatrixFactorizationTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/MatrixFactorizationTests.cs index 6c3bf1e2c1..0c612c6a3f 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/MatrixFactorizationTests.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/MatrixFactorizationTests.cs @@ -202,5 +202,107 @@ public void MatrixFactorizationInMemoryData() // Native test. Just check the pipeline runs. Assert.True(metrics.L2 < 0.1); } + + internal class MatrixElementZeroBased + { + // Matrix column index starts from 0 and is at most _synthesizedMatrixColumnCount-1. + // Contiguous=true means that all values from 0 to _synthesizedMatrixColumnCount-1 are allowed keys. + [KeyType(Contiguous = true, Count = _synthesizedMatrixColumnCount, Min = 0)] + public uint MatrixColumnIndex; + // Matrix row index starts from 0 and is at most _synthesizedMatrixRowCount-1. + // Contiguous=true means that all values from 0 to _synthesizedMatrixRowCount-1 are allowed keys. 
+ [KeyType(Contiguous = true, Count = _synthesizedMatrixRowCount, Min = 0)] + public uint MatrixRowIndex; + // The value at the MatrixColumnIndex-th column and the MatrixRowIndex-th row in the considered matrix. + public float Value; + } + + internal class MatrixElementZeroBasedForScore + { + // Matrix column index starts from 0 and is at most _synthesizedMatrixColumnCount-1. + // Contiguous=true means that all values from 0 to _synthesizedMatrixColumnCount-1 are allowed keys. + [KeyType(Contiguous = true, Count = _synthesizedMatrixColumnCount, Min = 0)] + public uint MatrixColumnIndex; + // Matrix row index starts from 0 and is at most _synthesizedMatrixRowCount-1. + // Contiguous=true means that all values from 0 to _synthesizedMatrixRowCount-1 are allowed keys. + [KeyType(Contiguous = true, Count = _synthesizedMatrixRowCount, Min = 0)] + public uint MatrixRowIndex; + public float Score; + } + + [ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // This test is being fixed as part of issue #1441. + public void MatrixFactorizationInMemoryDataZeroBaseIndex() + { + // Create an in-memory matrix as a list of tuples (column index, row index, value). + // Iterators i and j are column and row indexes, respectively. + var dataMatrix = new List(); + for (uint i = 0; i < _synthesizedMatrixColumnCount; ++i) + for (uint j = 0; j < _synthesizedMatrixRowCount; ++j) + dataMatrix.Add(new MatrixElementZeroBased() { MatrixColumnIndex = i, MatrixRowIndex = j, Value = (i + j) % 5 }); + + // Convert the in-memory matrix into an IDataView so that ML.NET components can consume it. + var dataView = ComponentCreation.CreateDataView(Env, dataMatrix); + + // Create a matrix factorization trainer which may consume "Value" as the training label, "MatrixColumnIndex" as the + // matrix's column index, and "MatrixRowIndex" as the matrix's row index. 
+ var mlContext = new MLContext(seed: 1, conc: 1); + var pipeline = new MatrixFactorizationTrainer(mlContext, nameof(MatrixElementZeroBased.Value), + nameof(MatrixElementZeroBased.MatrixColumnIndex), nameof(MatrixElementZeroBased.MatrixRowIndex), + advancedSettings: s => + { + s.NumIterations = 100; + s.NumThreads = 1; // To eliminate randomness, # of threads must be 1. + s.K = 32; + s.Eta = 0.5; + }); + + // Train a matrix factorization model. + var model = pipeline.Fit(dataView); + + // Check that the column names and key types recorded in the trained model are as expected. + Assert.True(model.MatrixColumnIndexColumnName == nameof(MatrixElementZeroBased.MatrixColumnIndex)); + Assert.True(model.MatrixRowIndexColumnName == nameof(MatrixElementZeroBased.MatrixRowIndex)); + Assert.True(model.MatrixColumnIndexColumnType.IsKey); + Assert.True(model.MatrixRowIndexColumnType.IsKey); + var matColKeyType = model.MatrixColumnIndexColumnType.AsKey; + Assert.True(matColKeyType.Min == 0); + Assert.True(matColKeyType.Count == _synthesizedMatrixColumnCount); + var matRowKeyType = model.MatrixRowIndexColumnType.AsKey; + Assert.True(matRowKeyType.Min == 0); + Assert.True(matRowKeyType.Count == _synthesizedMatrixRowCount); + + // Apply the trained model to the training set + var prediction = model.Transform(dataView); + + // Calculate regression metrics over the whole prediction result. + var metrics = mlContext.Regression.Evaluate(prediction, label: "Value", score: "Score"); + + // Make sure the prediction error is not too large. + Assert.InRange(metrics.L2, 0, 0.1); + + foreach (var pred in prediction.AsEnumerable(mlContext, false)) + // Test data contains no out-of-range indexes (i.e., all indexes can be found in the training matrix), + // so NaN should never happen. 
+ Assert.True(!float.IsNaN(pred.Score)); + + // Create out-of-range examples and make sure their predictions are all NaN + var invalidTestMatrix = new List() + { + // An example with a matrix column index just greater than the maximum allowed value + new MatrixElementZeroBasedForScore() { MatrixColumnIndex = _synthesizedMatrixFirstColumnIndex + _synthesizedMatrixColumnCount, MatrixRowIndex = _synthesizedMatrixFirstRowIndex, Score = default }, + // An example with a matrix row index just greater than the maximum allowed value + new MatrixElementZeroBasedForScore() { MatrixColumnIndex = _synthesizedMatrixFirstColumnIndex, MatrixRowIndex = _synthesizedMatrixFirstRowIndex + _synthesizedMatrixRowCount, Score = default } + }; + + // Convert the in-memory matrix into an IDataView so that ML.NET components can consume it. + var invalidTestDataView = ComponentCreation.CreateDataView(mlContext, invalidTestMatrix); + + // Apply the trained model to the examples with out-of-range indexes. + var invalidPrediction = model.Transform(invalidTestDataView); + + foreach (var pred in invalidPrediction.AsEnumerable(mlContext, false)) + // The presence of out-of-range indexes may lead to NaN + Assert.True(float.IsNaN(pred.Score)); + } } } \ No newline at end of file