From 9ac648b82d58546ee8ed0bf6fb813d533509d9e7 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Wed, 31 Oct 2018 14:07:55 -0700 Subject: [PATCH 1/9] Make a test for preventing it from happening again --- .../MatrixFactorizationTests.cs | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/MatrixFactorizationTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/MatrixFactorizationTests.cs index 6c3bf1e2c1..cd602b6a35 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/MatrixFactorizationTests.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/MatrixFactorizationTests.cs @@ -202,5 +202,94 @@ public void MatrixFactorizationInMemoryData() // Native test. Just check the pipeline runs. Assert.True(metrics.L2 < 0.1); } + + internal class MatrixElementZeroBased + { + // Matrix column index starts from 0 and is at most _synthesizedMatrixColumnCount-1. + // Contieuous=true means that all values from 0 to _synthesizedMatrixColumnCount-1 are allowed keys. + [KeyType(Contiguous = true, Count = _synthesizedMatrixColumnCount, Min = 0)] + public uint MatrixColumnIndex; + // Matrix row index starts from 0 and is at most _synthesizedMatrixRowCount-1. + // Contieuous=true means that all values from 0 to _synthesizedMatrixRowCount-1 are allowed keys. + [KeyType(Contiguous = true, Count = _synthesizedMatrixRowCount, Min = 0)] + public uint MatrixRowIndex; + // The value at the MatrixColumnIndex-th column and the MatrixRowIndex-th row in the considered matrix. + public float Value; + } + + internal class MatrixElementZeroBasedForScore + { + // Matrix column index starts from 0 and is at most _synthesizedMatrixColumnCount-1. + // Contieuous=true means that all values from 0 to _synthesizedMatrixColumnCount-1 are allowed keys. + [KeyType(Contiguous = true, Count = _synthesizedMatrixColumnCount, Min = 0)] + public uint MatrixColumnIndex; + // Matrix row index starts from 0 and is at most _synthesizedMatrixRowCount-1. 
+ // Contiguous=true means that all values from 0 to _synthesizedMatrixRowCount-1 are allowed keys. + [KeyType(Contiguous = true, Count = _synthesizedMatrixRowCount, Min = 0)] + public uint MatrixRowIndex; + public float Score; + } + + [ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] // This test is being fixed as part of issue #1441. + public void MatrixFactorizationInMemoryDataZeroBaseIndex() + { + // Create an in-memory matrix as a list of tuples (column index, row index, value). + // Iterators i and j are column and row indexes, respectively. + var dataMatrix = new List(); + for (uint i = 0; i < _synthesizedMatrixColumnCount; ++i) + for (uint j = 0; j < _synthesizedMatrixRowCount; ++j) + dataMatrix.Add(new MatrixElementZeroBased() { MatrixColumnIndex = i, MatrixRowIndex = j, Value = (i + j) % 5 }); + + // Convert the in-memory matrix into an IDataView so that ML.NET components can consume it. + var dataView = ComponentCreation.CreateDataView(Env, dataMatrix); + + // Create a matrix factorization trainer which may consume "Value" as the training label, "MatrixColumnIndex" as the + // matrix's column index, and "MatrixRowIndex" as the matrix's row index. + var mlContext = new MLContext(seed: 1, conc: 1); + var pipeline = new MatrixFactorizationTrainer(mlContext, nameof(MatrixElementZeroBased.Value), + nameof(MatrixElementZeroBased.MatrixColumnIndex), nameof(MatrixElementZeroBased.MatrixRowIndex), + advancedSettings: s => + { + s.NumIterations = 10; + s.NumThreads = 1; // To eliminate randomness, # of threads must be 1. + s.K = 32; + }); + + // Train a matrix factorization model. + var model = pipeline.Fit(dataView); + + // Check if the expected types in the trained model are expected. 
+ Assert.True(model.MatrixColumnIndexColumnName == nameof(MatrixElementZeroBased.MatrixColumnIndex)); + Assert.True(model.MatrixRowIndexColumnName == nameof(MatrixElementZeroBased.MatrixRowIndex)); + Assert.True(model.MatrixColumnIndexColumnType.IsKey); + Assert.True(model.MatrixRowIndexColumnType.IsKey); + var matColKeyType = model.MatrixColumnIndexColumnType.AsKey; + Assert.True(matColKeyType.Min == 0); + Assert.True(matColKeyType.Count == _synthesizedMatrixColumnCount); + var matRowKeyType = model.MatrixRowIndexColumnType.AsKey; + Assert.True(matRowKeyType.Min == 0); + Assert.True(matRowKeyType.Count == _synthesizedMatrixRowCount); + + // Create matrix examples for testing prediction + var testMatrix = new List(); + for (uint i = 0; i < _synthesizedMatrixColumnCount; ++i) + for (uint j = 0; j < _synthesizedMatrixRowCount; ++j) + testMatrix.Add(new MatrixElementZeroBasedForScore() { MatrixColumnIndex = i, MatrixRowIndex = j, Score = default }); + + // Convert the in-memory matrix into an IDataView so that ML.NET components can consume it. + var testDataView = ComponentCreation.CreateDataView(Env, testMatrix); + + // Apply the trained model to the training set + var prediction = model.Transform(dataView); + + foreach (var pred in prediction.AsEnumerable(mlContext, false)) + Assert.True(!float.IsNaN(pred.Score)); + + // Calculate regression matrices for the prediction result + var metrics = mlContext.Regression.Evaluate(prediction, label: "Value", score: "Score"); + + // Native test. Just check the pipeline runs. 
+ Assert.True(metrics.L2 < 0.1); + } } } \ No newline at end of file From 0a434eb2d91d55f2f33062161c679c0733834adc Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Thu, 1 Nov 2018 18:49:42 -0700 Subject: [PATCH 2/9] Draft of making 0-based working for MF --- .../DataViewConstructionUtils.cs | 70 ++++++++++++ .../MatrixFactorizationPredictor.cs | 8 +- .../MatrixFactorizationTrainer.cs | 107 +++++++++++------- .../MatrixFactorizationTests.cs | 41 ++++--- 4 files changed, 165 insertions(+), 61 deletions(-) diff --git a/src/Microsoft.ML.Api/DataViewConstructionUtils.cs b/src/Microsoft.ML.Api/DataViewConstructionUtils.cs index 612be99faf..da3883e40f 100644 --- a/src/Microsoft.ML.Api/DataViewConstructionUtils.cs +++ b/src/Microsoft.ML.Api/DataViewConstructionUtils.cs @@ -208,6 +208,69 @@ private Delegate CreateGetter(ColumnType colType, InternalSchemaDefinition.Colum else Host.Assert(colType.RawType == outputType); del = CreateDirectGetterDelegate; + + if (colType.IsKey) + { + var keyRawType = colType.RawType; + Host.Assert(colType.AsKey.Contiguous); + if (keyRawType == typeof(uint)) + { + ValueGetter rawGetter = CreateDirectGetter(peek); + uint rawKeyValue = 0; + uint min = (uint)colType.AsKey.Min; + uint max = min + (uint)colType.AsKey.Count - 1; + ValueGetter getter = (ref uint dst) => + { + rawGetter(ref rawKeyValue); + dst = rawKeyValue - min + 1; + }; + return getter; + } + else if (keyRawType == typeof(byte)) + { + ValueGetter rawGetter = CreateDirectGetter(peek); + byte rawKeyValue = 0; + uint min = (uint)colType.AsKey.Min; + uint max = min + (uint)colType.AsKey.Count - 1; + ValueGetter getter = (ref uint dst) => + { + rawGetter(ref rawKeyValue); + // U2 (byte) input, rawKeyValue, gets converted to uint eventually. 
+ dst = rawKeyValue - min + 1; + }; + return getter; + } + else if (keyRawType == typeof(bool)) + { + ValueGetter rawGetter = CreateDirectGetter(peek); + bool rawKeyValue = false; + uint min = (uint)colType.AsKey.Min; + uint max = min + (uint)colType.AsKey.Count - 1; + ValueGetter getter = (ref uint dst) => + { + rawGetter(ref rawKeyValue); + // U1 (byte) input, rawKeyValue, gets converted to uint eventually. + if (rawKeyValue) + dst = 1 - min + 1; + else + dst = 0 - min + 1; + }; + return getter; + } + else if (keyRawType == typeof(ulong)) + { + ValueGetter rawGetter = CreateDirectGetter(peek); + ulong rawKeyValue = 0; + ulong min = colType.AsKey.Min; + ulong max = min + (ulong)colType.AsKey.Count - 1; + ValueGetter getter = (ref ulong dst) => + { + rawGetter(ref rawKeyValue); + dst = rawKeyValue - min + 1; + }; + return getter; + } + } } else { @@ -288,6 +351,13 @@ private Delegate CreateDirectGetterDelegate(Delegate peekDel) peek(GetCurrentRowObject(), Position, ref dst)); } + private ValueGetter CreateDirectGetter(Delegate peekDel) + { + var peek = peekDel as Peek; + Host.AssertValue(peek); + return (ref TDst dst) => peek(GetCurrentRowObject(), Position, ref dst); + } + protected abstract TRow GetCurrentRowObject(); public bool IsColumnActive(int col) diff --git a/src/Microsoft.ML.Recommender/MatrixFactorizationPredictor.cs b/src/Microsoft.ML.Recommender/MatrixFactorizationPredictor.cs index 44da79c703..6b71a165c6 100644 --- a/src/Microsoft.ML.Recommender/MatrixFactorizationPredictor.cs +++ b/src/Microsoft.ML.Recommender/MatrixFactorizationPredictor.cs @@ -13,7 +13,6 @@ using Microsoft.ML.Runtime.Model; using Microsoft.ML.Runtime.Recommender; using Microsoft.ML.Runtime.Recommender.Internal; -using Microsoft.ML.Trainers; using Microsoft.ML.Trainers.Recommender; [assembly: LoadableClass(typeof(MatrixFactorizationPredictor), null, typeof(SignatureLoadModel), "Matrix Factorization Predictor Executor", MatrixFactorizationPredictor.LoaderSignature)] @@ -347,9 +346,12 
@@ private Delegate[] CreateGetter(IRow input, bool[] active) var getters = new Delegate[1]; if (active[0]) { + // First check if expected columns are ok and then create getters to access those columns' values. CheckInputSchema(input.Schema, _matrixColumnIndexColumnIndex, _matrixRowIndexCololumnIndex); - var matrixColumnIndexGetter = input.GetGetter(_matrixColumnIndexColumnIndex); - var matrixRowIndexGetter = input.GetGetter(_matrixRowIndexCololumnIndex); + var matrixColumnIndexGetter = RowCursorUtils.GetGetterAs(NumberType.U4, input, _matrixColumnIndexColumnIndex); + var matrixRowIndexGetter = RowCursorUtils.GetGetterAs(NumberType.U4, input, _matrixRowIndexCololumnIndex); + + // Assign the getter of the prediction score. It maps a pair of matrix column index and matrix row index to a scalar. getters[0] = _parent.GetGetter(matrixColumnIndexGetter, matrixRowIndexGetter); } return getters; diff --git a/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs b/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs index 5d32a4fd66..784dabba64 100644 --- a/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs +++ b/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs @@ -29,44 +29,50 @@ public sealed class MatrixFactorizationTrainer : TrainerBase - /// The row, column, and label columns that the trainer expects. This module uses tuples of (row index, column index, label value) to specify a matrix. + /// The row index, column index, and label columns needed to specify the training matrix. This trainer uses tuples of (row index, column index, label value) to specify a matrix. /// For example, a 2-by-2 matrix /// [9, 4] /// [8, 7] /// can be encoded as tuples (0, 0, 9), (0, 1, 4), (1, 0, 8), and (1, 1, 7). It means that the row/column/label column contains [0, 0, 1, 1]/ /// [0, 1, 0, 1]/[9, 4, 8, 7]. 
/// - public readonly SchemaShape.Column MatrixColumnIndexColumn; // column indices of the training matrix - public readonly SchemaShape.Column MatrixRowIndexColumn; // row indices of the training matrix - public readonly SchemaShape.Column LabelColumn; + + /// + /// The name of variable (i.e., Column in a type system) used be used as matrix's column index. + /// + public readonly string MatrixColumnIndexName; + + /// + /// The name of variable (i.e., column in a type system) used as matrix's row index. + /// + public readonly string MatrixRowIndexName; + + /// + /// The name variable (i.e., column in a type system) used as matrix's element value. + /// + public readonly string LabelName; /// /// The contains general parameters for this trainer. @@ -95,7 +113,7 @@ public sealed class Arguments /// Extra information the trainer can use. For example, its validation set (if not null) can be use to evaluate the /// training progress made at each training iteration. /// - public readonly TrainerEstimatorContext Context; + private readonly TrainerEstimatorContext _context; /// /// Legacy constructor initializing a new instance of through the legacy @@ -149,11 +167,11 @@ public MatrixFactorizationTrainer(IHostEnvironment env, string labelColumn, stri _doNmf = args.NonNegative; Info = new TrainerInfo(normalization: false, caching: false); - Context = context; + _context = context; - LabelColumn = new SchemaShape.Column(labelColumn, SchemaShape.Column.VectorKind.Scalar, NumberType.R4, false); - MatrixColumnIndexColumn = new SchemaShape.Column(matrixColumnIndexColumnName, SchemaShape.Column.VectorKind.Scalar, NumberType.U4, true); - MatrixRowIndexColumn = new SchemaShape.Column(matrixRowIndexColumnName, SchemaShape.Column.VectorKind.Scalar, NumberType.U4, true); + LabelName = labelColumn; + MatrixColumnIndexName = matrixColumnIndexColumnName; + MatrixRowIndexName = matrixRowIndexColumnName; } /// @@ -210,22 +228,21 @@ private MatrixFactorizationPredictor TrainCore(IChannel 
ch, RoleMappedData data, int rowCount = matrixRowIndexColInfo.Type.KeyCount; ch.Assert(rowCount > 0); ch.Assert(colCount > 0); - // Checks for equality on the validation set ensure it is correct here. + // Checks for equality on the validation set ensure it is correct here. using (var cursor = data.Data.GetRowCursor(c => c == matrixColumnIndexColInfo.Index || c == matrixRowIndexColInfo.Index || c == data.Schema.Label.Index)) { // LibMF works only over single precision floats, but we want to be able to consume either. - ValueGetter labGetter = RowCursorUtils.GetGetterAs(NumberType.R4, cursor, data.Schema.Label.Index); - var matrixColumnIndexGetter = cursor.GetGetter(matrixColumnIndexColInfo.Index); - var matrixRowIndexGetter = cursor.GetGetter(matrixRowIndexColInfo.Index); + var labGetter = RowCursorUtils.GetGetterAs(NumberType.R4, cursor, data.Schema.Label.Index); + var matrixColumnIndexGetter = RowCursorUtils.GetGetterAs(NumberType.U4, cursor, matrixColumnIndexColInfo.Index); + var matrixRowIndexGetter = RowCursorUtils.GetGetterAs(NumberType.U4, cursor, matrixRowIndexColInfo.Index); if (validData == null) { // Have the trainer do its work. 
using (var buffer = PrepareBuffer()) { - buffer.Train(ch, rowCount, colCount, - cursor, labGetter, matrixRowIndexGetter, matrixColumnIndexGetter); + buffer.Train(ch, rowCount, colCount, cursor, labGetter, matrixRowIndexGetter, matrixColumnIndexGetter); predictor = new MatrixFactorizationPredictor(Host, buffer, matrixColumnIndexColInfo.Type.AsKey, matrixRowIndexColInfo.Type.AsKey); } } @@ -234,16 +251,16 @@ private MatrixFactorizationPredictor TrainCore(IChannel ch, RoleMappedData data, using (var validCursor = validData.Data.GetRowCursor( c => c == validMatrixColumnIndexColInfo.Index || c == validMatrixRowIndexColInfo.Index || c == validData.Schema.Label.Index)) { - ValueGetter validLabGetter = RowCursorUtils.GetGetterAs(NumberType.R4, validCursor, validData.Schema.Label.Index); - var validXGetter = validCursor.GetGetter(validMatrixColumnIndexColInfo.Index); - var validYGetter = validCursor.GetGetter(validMatrixRowIndexColInfo.Index); + ValueGetter validLabelGetter = RowCursorUtils.GetGetterAs(NumberType.R4, validCursor, validData.Schema.Label.Index); + var validMatrixColumnIndexGetter = RowCursorUtils.GetGetterAs(NumberType.U4, validCursor, validMatrixColumnIndexColInfo.Index); + var validMatrixRowIndexGetter = RowCursorUtils.GetGetterAs(NumberType.U4, validCursor, validMatrixRowIndexColInfo.Index); // Have the trainer do its work. 
using (var buffer = PrepareBuffer()) { buffer.TrainWithValidation(ch, rowCount, colCount, cursor, labGetter, matrixRowIndexGetter, matrixColumnIndexGetter, - validCursor, validLabGetter, validYGetter, validXGetter); + validCursor, validLabelGetter, validMatrixRowIndexGetter, validMatrixColumnIndexGetter); predictor = new MatrixFactorizationPredictor(Host, buffer, matrixColumnIndexColInfo.Type.AsKey, matrixRowIndexColInfo.Type.AsKey); } } @@ -268,12 +285,12 @@ public MatrixFactorizationPredictionTransformer Fit(IDataView input) MatrixFactorizationPredictor model = null; var roles = new List>(); - roles.Add(new KeyValuePair(RoleMappedSchema.ColumnRole.Label, LabelColumn.Name)); - roles.Add(new KeyValuePair(RecommenderUtils.MatrixColumnIndexKind.Value, MatrixColumnIndexColumn.Name)); - roles.Add(new KeyValuePair(RecommenderUtils.MatrixRowIndexKind.Value, MatrixRowIndexColumn.Name)); + roles.Add(new KeyValuePair(RoleMappedSchema.ColumnRole.Label, LabelName)); + roles.Add(new KeyValuePair(RecommenderUtils.MatrixColumnIndexKind.Value, MatrixColumnIndexName)); + roles.Add(new KeyValuePair(RecommenderUtils.MatrixRowIndexKind.Value, MatrixRowIndexName)); var trainingData = new RoleMappedData(input, roles); - var validData = Context == null ? null : new RoleMappedData(Context.ValidationSet, roles); + var validData = _context == null ? 
null : new RoleMappedData(_context.ValidationSet, roles); using (var ch = Host.Start("Training")) using (var pch = Host.StartProgressChannel("Training")) @@ -281,7 +298,7 @@ public MatrixFactorizationPredictionTransformer Fit(IDataView input) model = TrainCore(ch, trainingData, validData); } - return new MatrixFactorizationPredictionTransformer(Host, model, input.Schema, MatrixColumnIndexColumn.Name, MatrixRowIndexColumn.Name); + return new MatrixFactorizationPredictionTransformer(Host, model, input.Schema, MatrixColumnIndexName, MatrixRowIndexName); } public SchemaShape GetOutputSchema(SchemaShape inputSchema) @@ -297,13 +314,15 @@ void CheckColumnsCompatible(SchemaShape.Column cachedColumn, string expectedColu throw Host.Except($"{expectedColumnName} column '{cachedColumn.Name}' is not compatible"); } - // In prediction phase, no label column is expected. - if (LabelColumn != null) - CheckColumnsCompatible(LabelColumn, LabelColumn.Name); + // Check if label column is good. + var labelColumn = new SchemaShape.Column(LabelName, SchemaShape.Column.VectorKind.Scalar, NumberType.R4, false); + CheckColumnsCompatible(labelColumn, LabelName); - // In both of training and prediction phases, we need columns of user ID and column ID. - CheckColumnsCompatible(MatrixColumnIndexColumn, MatrixColumnIndexColumn.Name); - CheckColumnsCompatible(MatrixRowIndexColumn, MatrixRowIndexColumn.Name); + // Check if columns of matrix's row and column indexes are good. Note that column of IDataView and column of matrix are two different things. 
+ var matrixColumnIndexColumn = new SchemaShape.Column(MatrixColumnIndexName, SchemaShape.Column.VectorKind.Scalar, NumberType.U4, true); + var matrixRowIndexColumn = new SchemaShape.Column(MatrixRowIndexName, SchemaShape.Column.VectorKind.Scalar, NumberType.U4, true); + CheckColumnsCompatible(matrixColumnIndexColumn, MatrixColumnIndexName); + CheckColumnsCompatible(matrixRowIndexColumn, MatrixRowIndexName); // Input columns just pass through so that output column dictionary contains all input columns. var outColumns = inputSchema.Columns.ToDictionary(x => x.Name); @@ -317,7 +336,7 @@ void CheckColumnsCompatible(SchemaShape.Column cachedColumn, string expectedColu private SchemaShape.Column[] GetOutputColumnsCore(SchemaShape inputSchema) { - bool success = inputSchema.TryFindColumn(LabelColumn.Name, out var labelCol); + bool success = inputSchema.TryFindColumn(LabelName, out var labelCol); Contracts.Assert(success); return new[] diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/MatrixFactorizationTests.cs b/test/Microsoft.ML.Tests/TrainerEstimators/MatrixFactorizationTests.cs index cd602b6a35..0c612c6a3f 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/MatrixFactorizationTests.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/MatrixFactorizationTests.cs @@ -250,9 +250,10 @@ public void MatrixFactorizationInMemoryDataZeroBaseIndex() nameof(MatrixElementZeroBased.MatrixColumnIndex), nameof(MatrixElementZeroBased.MatrixRowIndex), advancedSettings: s => { - s.NumIterations = 10; + s.NumIterations = 100; s.NumThreads = 1; // To eliminate randomness, # of threads must be 1. s.K = 32; + s.Eta = 0.5; }); // Train a matrix factorization model. 
@@ -270,26 +271,38 @@ public void MatrixFactorizationInMemoryDataZeroBaseIndex() Assert.True(matRowKeyType.Min == 0); Assert.True(matRowKeyType.Count == _synthesizedMatrixRowCount); - // Create matrix examples for testing prediction - var testMatrix = new List(); - for (uint i = 0; i < _synthesizedMatrixColumnCount; ++i) - for (uint j = 0; j < _synthesizedMatrixRowCount; ++j) - testMatrix.Add(new MatrixElementZeroBasedForScore() { MatrixColumnIndex = i, MatrixRowIndex = j, Score = default }); - - // Convert the in-memory matrix into an IDataView so that ML.NET components can consume it. - var testDataView = ComponentCreation.CreateDataView(Env, testMatrix); - // Apply the trained model to the training set var prediction = model.Transform(dataView); + // Calculate regression metrics over the whole prediction result. + var metrics = mlContext.Regression.Evaluate(prediction, label: "Value", score: "Score"); + + // Make sure the prediction error is not too large. + Assert.InRange(metrics.L2, 0, 0.1); + foreach (var pred in prediction.AsEnumerable(mlContext, false)) + // Test data contains no out-of-range indexes (i.e., all indexes can be found in the training matrix), + // so NaN should never happen. 
Assert.True(!float.IsNaN(pred.Score)); - // Calculate regression matrices for the prediction result - var metrics = mlContext.Regression.Evaluate(prediction, label: "Value", score: "Score"); + // Create out-of-range examples and make sure their predictions are all NaN + var invalidTestMatrix = new List() + { + // An example with a matrix column index just greater than the maximum allowed value + new MatrixElementZeroBasedForScore() { MatrixColumnIndex = _synthesizedMatrixFirstColumnIndex + _synthesizedMatrixColumnCount, MatrixRowIndex = _synthesizedMatrixFirstRowIndex, Score = default }, + // An example with a matrix row index just greater than the maximum allowed value + new MatrixElementZeroBasedForScore() { MatrixColumnIndex = _synthesizedMatrixFirstColumnIndex, MatrixRowIndex = _synthesizedMatrixFirstRowIndex + _synthesizedMatrixRowCount, Score = default } + }; - // Native test. Just check the pipeline runs. - Assert.True(metrics.L2 < 0.1); + // Convert the in-memory matrix into an IDataView so that ML.NET components can consume it. + var invalidTestDataView = ComponentCreation.CreateDataView(mlContext, invalidTestMatrix); + + // Apply the trained model to the examples with out-of-range indexes. 
+ var invalidPrediction = model.Transform(invalidTestDataView); + + foreach (var pred in invalidPrediction.AsEnumerable(mlContext, false)) + // The presence of out-of-range indexes may lead to NaN + Assert.True(float.IsNaN(pred.Score)); } } } \ No newline at end of file From 68ab135367761a4ad8bbf5479b54f1c7a381e52c Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Thu, 1 Nov 2018 19:47:13 -0700 Subject: [PATCH 3/9] Out-of-range leads to 0 --- .../DataViewConstructionUtils.cs | 46 +++++++++++++------ 1 file changed, 32 insertions(+), 14 deletions(-) diff --git a/src/Microsoft.ML.Api/DataViewConstructionUtils.cs b/src/Microsoft.ML.Api/DataViewConstructionUtils.cs index da3883e40f..df02a5ca0f 100644 --- a/src/Microsoft.ML.Api/DataViewConstructionUtils.cs +++ b/src/Microsoft.ML.Api/DataViewConstructionUtils.cs @@ -217,12 +217,15 @@ private Delegate CreateGetter(ColumnType colType, InternalSchemaDefinition.Colum { ValueGetter rawGetter = CreateDirectGetter(peek); uint rawKeyValue = 0; - uint min = (uint)colType.AsKey.Min; - uint max = min + (uint)colType.AsKey.Count - 1; + ulong min = colType.AsKey.Min; + ulong max = min + (ulong)colType.AsKey.Count - 1; ValueGetter getter = (ref uint dst) => { rawGetter(ref rawKeyValue); - dst = rawKeyValue - min + 1; + if (min <= rawKeyValue && rawKeyValue <= max) + dst = (uint)(rawKeyValue - min + 1); + else + dst = 0; }; return getter; } @@ -230,13 +233,16 @@ private Delegate CreateGetter(ColumnType colType, InternalSchemaDefinition.Colum { ValueGetter rawGetter = CreateDirectGetter(peek); byte rawKeyValue = 0; - uint min = (uint)colType.AsKey.Min; - uint max = min + (uint)colType.AsKey.Count - 1; - ValueGetter getter = (ref uint dst) => + ulong min = colType.AsKey.Min; + ulong max = min + (ulong)colType.AsKey.Count + 1; + ValueGetter getter = (ref byte dst) => { rawGetter(ref rawKeyValue); // U2 (byte) input, rawKeyValue, gets converted to uint eventually. 
- dst = rawKeyValue - min + 1; + if (min <= rawKeyValue && rawKeyValue <= max) + dst = (byte)(rawKeyValue - min + 1); + else + dst = 0; }; return getter; } @@ -244,16 +250,23 @@ private Delegate CreateGetter(ColumnType colType, InternalSchemaDefinition.Colum { ValueGetter rawGetter = CreateDirectGetter(peek); bool rawKeyValue = false; - uint min = (uint)colType.AsKey.Min; - uint max = min + (uint)colType.AsKey.Count - 1; - ValueGetter getter = (ref uint dst) => + ulong tmp = 0; + ulong min = colType.AsKey.Min; + ulong max = min + (ulong)colType.AsKey.Count - 1; + ValueGetter getter = (ref bool dst) => { rawGetter(ref rawKeyValue); // U1 (byte) input, rawKeyValue, gets converted to uint eventually. - if (rawKeyValue) - dst = 1 - min + 1; + tmp = rawKeyValue ? 1UL : 0UL; + if (min <= tmp && tmp <= max) + if (rawKeyValue) + // rawKeyValue is true, which is 1. + dst = (1 - min + 1) > 0; + else + // rawKeyValue is false, which is 0. + dst = (0 - min + 1) > 0; else - dst = 0 - min + 1; + dst = false; }; return getter; } @@ -266,10 +279,15 @@ private Delegate CreateGetter(ColumnType colType, InternalSchemaDefinition.Colum ValueGetter getter = (ref ulong dst) => { rawGetter(ref rawKeyValue); - dst = rawKeyValue - min + 1; + if (min <= rawKeyValue && rawKeyValue <= max) + dst = rawKeyValue - min + 1; + else + dst = 0; }; return getter; } + else + throw Host.ExceptNotSupp("Key type '{0}' is not yet supported.", keyRawType.Name); } } else From 2bd4c434d6913ca26a59397588b2657df9d3ee29 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Fri, 2 Nov 2018 11:35:43 -0700 Subject: [PATCH 4/9] Switch to generic-typed solution --- .../DataViewConstructionUtils.cs | 107 +++++------------- 1 file changed, 30 insertions(+), 77 deletions(-) diff --git a/src/Microsoft.ML.Api/DataViewConstructionUtils.cs b/src/Microsoft.ML.Api/DataViewConstructionUtils.cs index df02a5ca0f..aef98869d1 100644 --- a/src/Microsoft.ML.Api/DataViewConstructionUtils.cs +++ 
b/src/Microsoft.ML.Api/DataViewConstructionUtils.cs @@ -207,87 +207,15 @@ private Delegate CreateGetter(ColumnType colType, InternalSchemaDefinition.Colum Host.Assert(colType.RawType == Nullable.GetUnderlyingType(outputType)); else Host.Assert(colType.RawType == outputType); - del = CreateDirectGetterDelegate; - if (colType.IsKey) + if (!colType.IsKey) + del = CreateDirectGetterDelegate; + else { var keyRawType = colType.RawType; Host.Assert(colType.AsKey.Contiguous); - if (keyRawType == typeof(uint)) - { - ValueGetter rawGetter = CreateDirectGetter(peek); - uint rawKeyValue = 0; - ulong min = colType.AsKey.Min; - ulong max = min + (ulong)colType.AsKey.Count - 1; - ValueGetter getter = (ref uint dst) => - { - rawGetter(ref rawKeyValue); - if (min <= rawKeyValue && rawKeyValue <= max) - dst = (uint)(rawKeyValue - min + 1); - else - dst = 0; - }; - return getter; - } - else if (keyRawType == typeof(byte)) - { - ValueGetter rawGetter = CreateDirectGetter(peek); - byte rawKeyValue = 0; - ulong min = colType.AsKey.Min; - ulong max = min + (ulong)colType.AsKey.Count + 1; - ValueGetter getter = (ref byte dst) => - { - rawGetter(ref rawKeyValue); - // U2 (byte) input, rawKeyValue, gets converted to uint eventually. - if (min <= rawKeyValue && rawKeyValue <= max) - dst = (byte)(rawKeyValue - min + 1); - else - dst = 0; - }; - return getter; - } - else if (keyRawType == typeof(bool)) - { - ValueGetter rawGetter = CreateDirectGetter(peek); - bool rawKeyValue = false; - ulong tmp = 0; - ulong min = colType.AsKey.Min; - ulong max = min + (ulong)colType.AsKey.Count - 1; - ValueGetter getter = (ref bool dst) => - { - rawGetter(ref rawKeyValue); - // U1 (byte) input, rawKeyValue, gets converted to uint eventually. - tmp = rawKeyValue ? 1UL : 0UL; - if (min <= tmp && tmp <= max) - if (rawKeyValue) - // rawKeyValue is true, which is 1. - dst = (1 - min + 1) > 0; - else - // rawKeyValue is false, which is 0. 
- dst = (0 - min + 1) > 0; - else - dst = false; - }; - return getter; - } - else if (keyRawType == typeof(ulong)) - { - ValueGetter rawGetter = CreateDirectGetter(peek); - ulong rawKeyValue = 0; - ulong min = colType.AsKey.Min; - ulong max = min + (ulong)colType.AsKey.Count - 1; - ValueGetter getter = (ref ulong dst) => - { - rawGetter(ref rawKeyValue); - if (min <= rawKeyValue && rawKeyValue <= max) - dst = rawKeyValue - min + 1; - else - dst = 0; - }; - return getter; - } - else - throw Host.ExceptNotSupp("Key type '{0}' is not yet supported.", keyRawType.Name); + Func delForKey = CreateKeyGetterDelegate; + return Utils.MarshalInvoke(delForKey, keyRawType, peek, colType); } } else @@ -376,6 +304,31 @@ private ValueGetter CreateDirectGetter(Delegate peekDel) return (ref TDst dst) => peek(GetCurrentRowObject(), Position, ref dst); } + private Delegate CreateKeyGetterDelegate(Delegate peekDel, ColumnType colType) + { + // Convert delegate function to a function which can fetch the underlying value. 
+ var peek = peekDel as Peek; + Host.AssertValue(peek); + Host.Check(colType.IsKey); + + TDst rawKeyValue = default; + ulong key = 0; // the key value as ulong + ulong min = colType.AsKey.Min; + ulong max = min + (ulong)colType.AsKey.Count - 1; + ulong result = 0; // the result as ulong + ValueGetter getter = (ref TDst dst) => + { + peek(GetCurrentRowObject(), Position, ref rawKeyValue); + key = (ulong)Convert.ChangeType(rawKeyValue, typeof(ulong)); + if (min <= key && key <= max) + result = key - min + 1; + else + result = 0; + dst = (TDst)Convert.ChangeType(result, typeof(TDst)); + }; + return getter; + } + protected abstract TRow GetCurrentRowObject(); public bool IsColumnActive(int col) From 37acbb0ce7d7a42eb4af99169ab0b185fe4dfc1a Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Fri, 2 Nov 2018 11:42:41 -0700 Subject: [PATCH 5/9] Add checks --- src/Microsoft.ML.Api/DataViewConstructionUtils.cs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Api/DataViewConstructionUtils.cs b/src/Microsoft.ML.Api/DataViewConstructionUtils.cs index aef98869d1..cd194c0bb0 100644 --- a/src/Microsoft.ML.Api/DataViewConstructionUtils.cs +++ b/src/Microsoft.ML.Api/DataViewConstructionUtils.cs @@ -306,13 +306,20 @@ private ValueGetter CreateDirectGetter(Delegate peekDel) private Delegate CreateKeyGetterDelegate(Delegate peekDel, ColumnType colType) { + // Make sure the function is dealing with key. + Host.Check(colType.IsKey); + // Following equations work only with contiguous key type. + Host.Check(colType.AsKey.Contiguous); + // Following equations work only with unsigned integers. + Host.Check(typeof(TDst) == typeof(ulong) || typeof(TDst) == typeof(uint) || + typeof(TDst) == typeof(byte) || typeof(TDst) == typeof(bool)); + // Convert delegate function to a function which can fetch the underlying value. 
var peek = peekDel as Peek; Host.AssertValue(peek); - Host.Check(colType.IsKey); TDst rawKeyValue = default; - ulong key = 0; // the key value as ulong + ulong key = 0; // the raw key value as ulong ulong min = colType.AsKey.Min; ulong max = min + (ulong)colType.AsKey.Count - 1; ulong result = 0; // the result as ulong From d415fa337ef4aa15574cc25c6d1b238ea01f1f21 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Fri, 2 Nov 2018 11:44:30 -0700 Subject: [PATCH 6/9] Remove unused function --- src/Microsoft.ML.Api/DataViewConstructionUtils.cs | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/Microsoft.ML.Api/DataViewConstructionUtils.cs b/src/Microsoft.ML.Api/DataViewConstructionUtils.cs index cd194c0bb0..a4bc8cfda5 100644 --- a/src/Microsoft.ML.Api/DataViewConstructionUtils.cs +++ b/src/Microsoft.ML.Api/DataViewConstructionUtils.cs @@ -297,13 +297,6 @@ private Delegate CreateDirectGetterDelegate(Delegate peekDel) peek(GetCurrentRowObject(), Position, ref dst)); } - private ValueGetter CreateDirectGetter(Delegate peekDel) - { - var peek = peekDel as Peek; - Host.AssertValue(peek); - return (ref TDst dst) => peek(GetCurrentRowObject(), Position, ref dst); - } - private Delegate CreateKeyGetterDelegate(Delegate peekDel, ColumnType colType) { // Make sure the function is dealing with key. 
From d6d47dc2f2916071df7dc38871dd0ccc9ddfcf59 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 5 Nov 2018 08:27:58 -0800 Subject: [PATCH 7/9] Add space between two tokens --- .../MatrixFactorizationTrainer.cs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs b/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs index 784dabba64..7684e4579b 100644 --- a/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs +++ b/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs @@ -29,15 +29,15 @@ public sealed class MatrixFactorizationTrainer : TrainerBase Date: Mon, 5 Nov 2018 08:30:17 -0800 Subject: [PATCH 8/9] Fix a typo --- src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs b/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs index 7684e4579b..27a411f2e2 100644 --- a/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs +++ b/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs @@ -90,7 +90,7 @@ public sealed class Arguments /// /// - /// The name of variable (i.e., Column in a type system) used be used as matrix's column index. + /// The name of variable (i.e., column in a type system) used as matrix's column index. 
/// public readonly string MatrixColumnIndexName; From b3101f6b1a7d54210fe288ca99d3e2cd2f833982 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 5 Nov 2018 08:33:55 -0800 Subject: [PATCH 9/9] Fix another typo --- src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs b/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs index 27a411f2e2..6750f98068 100644 --- a/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs +++ b/src/Microsoft.ML.Recommender/MatrixFactorizationTrainer.cs @@ -38,7 +38,7 @@ public sealed class Arguments [Argument(ArgumentType.AtMostOnce, HelpText = "Latent space dimension (denoted by k). If the factorized matrix is m-by-n, " + "two factor matrices found by matrix factorization are m-by-k and k-by-n, respectively. " + - "This value is also known as the rank of matrix factorization.")] + "This value is also known as the rank of matrix factorization because k is generally much smaller than m and n.")] [TGUI(SuggestedSweeps = "8,16,64,128")] [TlcModule.SweepableDiscreteParam("K", new object[] { 8, 16, 64, 128 })] public int K = 8; @@ -49,7 +49,7 @@ public sealed class Arguments public int NumIterations = 20; [Argument(ArgumentType.AtMostOnce, HelpText = "Initial learning rate. It specifies the speed of the training algorithm. " + - "Small value may increate the number of iterations needed to achieve a reasonable result. Large value may lead to numerical difficulty such as a infinity value.")] + "Small value may increase the number of iterations needed to achieve a reasonable result. Large value may lead to numerical difficulty such as a infinity value.")] [TGUI(SuggestedSweeps = "0.001,0.01,0.1")] [TlcModule.SweepableDiscreteParam("Eta", new object[] { 0.001f, 0.01f, 0.1f })] public double Eta = 0.1;