From 47b757a9f93ee2fac74e06f4a9efc25627d1bd72 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Thu, 24 Jan 2019 11:18:14 -0800 Subject: [PATCH 01/11] Added support for loading map from file through dataview. --- .../Transforms/ValueMappingTransformer.cs | 51 ++++++++++++++++++- .../Transformers/ValueMappingTests.cs | 40 +++++++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/Transforms/ValueMappingTransformer.cs b/src/Microsoft.ML.Data/Transforms/ValueMappingTransformer.cs index 3360f28904..d8fc647126 100644 --- a/src/Microsoft.ML.Data/Transforms/ValueMappingTransformer.cs +++ b/src/Microsoft.ML.Data/Transforms/ValueMappingTransformer.cs @@ -32,6 +32,55 @@ namespace Microsoft.ML.Transforms.Conversions { + /// + /// The ValueMappingEstimator is a 1-1 mapping from a key to value. This particular class load the mappings from an . + /// This gives user the flexibility to load the mapping from file instead of using IEnumerable in + /// + public sealed class ValueMappingEstimator : TrivialEstimator + { + private readonly (string input, string output)[] _columns; + + /// + /// Constructs the ValueMappingEstimator, key type -> value type mapping + /// + /// The environment to use. + /// An instance of that contains the key and value columns. + /// Name of the key column in . + /// Name of the value column in . + /// The list of columns to apply. 
+ public ValueMappingEstimator(IHostEnvironment env, IDataView lookupMap, string keyColumn, string valueColumn, params (string input, string output)[] columns) + : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(ValueMappingEstimator)), + new ValueMappingTransformer(env, lookupMap, keyColumn, valueColumn, columns)) + { + _columns = columns; + } + + public override SchemaShape GetOutputSchema(SchemaShape inputSchema) + { + Host.CheckValue(inputSchema, nameof(inputSchema)); + + var resultDic = inputSchema.ToDictionary(x => x.Name); + var vectorKind = Transformer.ValueColumnType is VectorType ? SchemaShape.Column.VectorKind.Vector : SchemaShape.Column.VectorKind.Scalar; + var isKey = Transformer.ValueColumnType is KeyType; + var columnType = (isKey) ? PrimitiveType.FromKind(DataKind.U4) : + Transformer.ValueColumnType; + var metadataShape = SchemaShape.Create(Transformer.ValueColumnMetadata.Schema); + foreach (var (Input, Output) in _columns) + { + if (!inputSchema.TryFindColumn(Input, out var originalColumn)) + throw Host.ExceptSchemaMismatch(nameof(inputSchema), "input", Input); + + if ((originalColumn.Kind == SchemaShape.Column.VectorKind.VariableVector || + originalColumn.Kind == SchemaShape.Column.VectorKind.Vector) && Transformer.ValueColumnType is VectorType) + throw Host.ExceptNotSupp("Column '{0}' cannot be mapped to values when the column and the map values are both vector type.", Input); + // Create the Value column + var col = new SchemaShape.Column(Output, vectorKind, columnType, isKey, metadataShape); + resultDic[Output] = col; + } + return new SchemaShape(resultDic.Values); + } + } + /// /// The ValueMappingEstimator is a 1-1 mapping from a key to value. The key type and value type are specified /// through TKey and TValue. TKey is always a scalar. TValue can be either a scalar or an array (array is only possible when input is scalar). 
@@ -410,7 +459,7 @@ public sealed class Arguments public bool ValuesAsKeyType = true; } - protected ValueMappingTransformer(IHostEnvironment env, IDataView lookupMap, + internal ValueMappingTransformer(IHostEnvironment env, IDataView lookupMap, string keyColumn, string valueColumn, (string input, string output)[] columns) : base(Contracts.CheckRef(env, nameof(env)).Register(nameof(ValueMappingTransformer)), columns) { diff --git a/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs b/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs index b4a9274e6b..7285072f94 100644 --- a/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs @@ -177,6 +177,46 @@ public void ValueMapVectorValueTest() Assert.Equal(values[0].Length, fValue.Length); } + class Map + { + public string Key; + public int Value; + } + + [Fact] + public void ValueMapDataViewAsMapTest() + { + var data = new[] { new TestClass() { A = "bar", B = "test", C = "foo" } }; + var dataView = ML.Data.ReadFromEnumerable(data); + + var map = new[] { new Map() { Key = "foo", Value = 1 }, + new Map() { Key = "bar", Value = 2 }, + new Map() { Key = "test", Value = 3 }, + new Map() { Key = "wahoo", Value = 4 } + }; + var mapView = ML.Data.ReadFromEnumerable(map); + + var estimator = new ValueMappingEstimator(Env, mapView, "Key", "Value", new[] { ("A", "D"), ("B", "E"), ("C", "F") }); + var t = estimator.Fit(dataView); + + var result = t.Transform(dataView); + var cursor = result.GetRowCursorForAllColumns(); + var getterD = cursor.GetGetter(result.Schema["D"].Index); + var getterE = cursor.GetGetter(result.Schema["E"].Index); + var getterF = cursor.GetGetter(result.Schema["F"].Index); + cursor.MoveNext(); + + int dValue = 0; + getterD(ref dValue); + Assert.Equal(2, dValue); + int eValue = 0; + getterE(ref eValue); + Assert.Equal(3, eValue); + int fValue = 0; + getterF(ref fValue); + Assert.Equal(1, fValue); + } + [Fact] public void 
ValueMapVectorStringValueTest() { From 8415c9af94adf9989acf4e9be1cee3aac346aeaf Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Fri, 25 Jan 2019 11:34:09 -0800 Subject: [PATCH 02/11] Added test to show tensorflow text classification scenario. --- .../Transformers/ValueMappingTests.cs | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs b/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs index 7285072f94..20c24b5fe4 100644 --- a/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs @@ -11,6 +11,7 @@ using Microsoft.ML.Model; using Microsoft.ML.RunTests; using Microsoft.ML.Tools; +using Microsoft.ML.Transforms; using Microsoft.ML.Transforms.Conversions; using Microsoft.ML.Transforms.Text; using Xunit; @@ -110,6 +111,70 @@ public void ValueMapInputIsVectorTest() Assert.Equal(1, fValue); } + class vec + { + [VectorType(600)] + public int[] Features; + } + + IEnumerable PadSentence(IDataView dataview) + { + var cursor = dataview.GetRowCursorForAllColumns(); + var getterVecD = cursor.GetGetter>(dataview.Schema["VecD"].Index); + while(cursor.MoveNext()) + { + + var v = new vec() + { + Features = new int[600] + }; + + VBuffer dValue = default; + getterVecD(ref dValue); + var values = dValue.GetValues(); + var indices = dValue.GetIndices(); + if (indices.Length > 0) + { + for (int i = 0; i < indices.Length; i++) + { + if (indices[i] > v.Features.Length) break; + v.Features[indices[i]] = values[i]; + } + } + else + { + for (int i = 0; i < values.Length; i++) + { + if (i > v.Features.Length) break; + v.Features[i] = values[i]; + } + } + + yield return v; + } + } + + [Fact] + public void ValueMapPadInputTest() + { + var data = new[] { new TestClass() { A = "bar test foo", B = "test", C = "foo" } }; + var dataView = ML.Data.ReadFromEnumerable(data); + + var keys = new List>() { "foo".AsMemory(), "bar".AsMemory(), 
"test".AsMemory(), "wahoo".AsMemory() }; + var values = new List() { 1, 2, 3, 4 }; + + var estimator = new WordTokenizingEstimator(Env, new[]{ + new WordTokenizingTransformer.ColumnInfo("A", "TokenizeA") + }).Append(new ValueMappingEstimator, int>(Env, keys, values, new[] { ("TokenizeA", "VecD"), ("B", "E"), ("C", "F") })); + var t = estimator.Fit(dataView); + + var result = t.Transform(dataView); + result = ML.Data.ReadFromEnumerable(PadSentence(result)); + + string modelLocation = @"E:\Tensorflow\sentiment_model"; + IDataView trans = new TensorFlowTransformer(ML, modelLocation, "Features", "Prediction/Softmax").Transform(result); + } + [Fact] public void ValueMapInputIsVectorAndValueAsStringKeyTypeTest() { From 3e4bbcdcea9cada49115586946cf2e75a4c1f0c0 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Mon, 28 Jan 2019 13:34:52 -0800 Subject: [PATCH 03/11] Updated TensorFlow version. --- build/Dependencies.props | 2 +- ...orflow-cpu-darwin-x86_64-1.10.0.tar.gz.sha | 1 - ...orflow-cpu-darwin-x86_64-1.12.0.tar.gz.sha | 1 + ...sorflow-cpu-linux-x86_64-1.10.0.tar.gz.sha | 1 - ...sorflow-cpu-linux-x86_64-1.12.0.tar.gz.sha | 1 + ...nsorflow-cpu-windows-x86_64-1.10.0.zip.sha | 1 - ...nsorflow-cpu-windows-x86_64-1.12.0.zip.sha | 1 + .../TensorflowTests.cs | 49 ++++++++++++++ .../Transformers/ValueMappingTests.cs | 64 ------------------- 9 files changed, 53 insertions(+), 68 deletions(-) delete mode 100644 src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-darwin-x86_64-1.10.0.tar.gz.sha create mode 100644 src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-darwin-x86_64-1.12.0.tar.gz.sha delete mode 100644 src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-linux-x86_64-1.10.0.tar.gz.sha create mode 100644 src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-linux-x86_64-1.12.0.tar.gz.sha delete mode 100644 src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-windows-x86_64-1.10.0.zip.sha create mode 100644 
src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-windows-x86_64-1.12.0.zip.sha diff --git a/build/Dependencies.props b/build/Dependencies.props index a251c7dcbd..49e2d5ce87 100644 --- a/build/Dependencies.props +++ b/build/Dependencies.props @@ -21,7 +21,7 @@ 4.5.0 4.5.0 4.5.0 - 1.10.0 + 1.12.0 diff --git a/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-darwin-x86_64-1.10.0.tar.gz.sha b/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-darwin-x86_64-1.10.0.tar.gz.sha deleted file mode 100644 index da8b53866b..0000000000 --- a/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-darwin-x86_64-1.10.0.tar.gz.sha +++ /dev/null @@ -1 +0,0 @@ -77218EC4DA96A73B15B8AA5637C9F21B389510A9FAF4DCF06DF5B81A5403015C6BA3EEE29BD8BA5B0694F40C671D8E6722D554C4F93F95C33F29AB491C70263C \ No newline at end of file diff --git a/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-darwin-x86_64-1.12.0.tar.gz.sha b/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-darwin-x86_64-1.12.0.tar.gz.sha new file mode 100644 index 0000000000..d1167ceb9b --- /dev/null +++ b/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-darwin-x86_64-1.12.0.tar.gz.sha @@ -0,0 +1 @@ +090706417EC29D91EEEABC5C25576374A86426CF25F27556C0EED4FD815D814C4F09FA7389ED8F614E4B34BF6438B9AE0ADA402BEA7CC9441446AB783A6F187D \ No newline at end of file diff --git a/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-linux-x86_64-1.10.0.tar.gz.sha b/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-linux-x86_64-1.10.0.tar.gz.sha deleted file mode 100644 index 6b865984ec..0000000000 --- a/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-linux-x86_64-1.10.0.tar.gz.sha +++ /dev/null @@ -1 +0,0 @@ -B9E9CD95BC6A28297ACAB0D684FBBFAFF1F9AE893432AC2D208120D767101AC20E2C55BC79E59DBE6E5BD9EC802026694960FA12137BB303061C5A21B62BD29E \ No newline at end of file diff --git 
a/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-linux-x86_64-1.12.0.tar.gz.sha b/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-linux-x86_64-1.12.0.tar.gz.sha new file mode 100644 index 0000000000..92dc9e7d0a --- /dev/null +++ b/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-linux-x86_64-1.12.0.tar.gz.sha @@ -0,0 +1 @@ +5359609DDF69D66474F720D6A1ED669942FEB6842096CFC3EAF44B84FA3F2F659829778446BD3C7C83871F7293CA481AC4732DF6DC7921ADA100B459E37198BD \ No newline at end of file diff --git a/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-windows-x86_64-1.10.0.zip.sha b/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-windows-x86_64-1.10.0.zip.sha deleted file mode 100644 index 92ce0db9fb..0000000000 --- a/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-windows-x86_64-1.10.0.zip.sha +++ /dev/null @@ -1 +0,0 @@ -66F3A9522917076038AE9CCA11FE805DD516C60B3A3E156B78C2E4BD0E3E5785A9D0380C5E06411473EF14A72B72FD93F954AA3496A12D1FAF0FA3393970E700 \ No newline at end of file diff --git a/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-windows-x86_64-1.12.0.zip.sha b/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-windows-x86_64-1.12.0.zip.sha new file mode 100644 index 0000000000..4d5e6ce4d6 --- /dev/null +++ b/src/Redist/Microsoft.ML.TensorFlow.Redist/libtensorflow-cpu-windows-x86_64-1.12.0.zip.sha @@ -0,0 +1 @@ +49DB72CDD8D10B78BB1CD17A058DF508E04B38BD287FF53EB9173A48D3994E11741B1EE6C9108303739819845F2F9D777EE3E767D737C24DB3A28B67FF68C951 \ No newline at end of file diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index 646c8dd8db..99a9153c96 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -10,8 +10,10 @@ using Microsoft.ML.ImageAnalytics; 
using Microsoft.ML.RunTests; using Microsoft.ML.Transforms; +using Microsoft.ML.Transforms.Conversions; using Microsoft.ML.Transforms.Normalizers; using Microsoft.ML.Transforms.TensorFlow; +using Microsoft.ML.Transforms.Text; using Xunit; namespace Microsoft.ML.Scenarios @@ -845,5 +847,52 @@ public void TensorFlowTransformCifarInvalidShape() } Assert.True(thrown); } + + /// + /// Class to hold features and predictions. + /// + public class TensorFlowSentiment + { + public string Sentiment_Text; + [VectorType(600)] + public int[] Features; + [VectorType(2)] + public float[] Prediction; + } + + [ConditionalFact(typeof(Environment), nameof(Environment.Is64BitProcess))] + public void TensorFlowSentimentClassificationTest() + { + var mlContext = new MLContext(seed: 1, conc: 1); + var data = new[] { new TensorFlowSentiment() { Sentiment_Text = "this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert is an amazing actor and now the same being director father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also to the two little boy's that played the of norman and paul they were just brilliant children are often left out of the list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you think the whole story was so lovely because it was true and was someone's life after all that was shared with us all" } }; + var dataView = 
mlContext.Data.ReadFromEnumerable(data); + + var lookupMap = mlContext.Data.ReadFromTextFile(@"E:\Tensorflow\sentiment_model\imdb_word_index.csv", + columns: new[] + { + new TextLoader.Column("Words", DataKind.TX, 0), + new TextLoader.Column("Ids", DataKind.I4, 1), + }, + separatorChar: ',' + ); + + var estimator = new WordTokenizingEstimator(mlContext, new[]{ + new WordTokenizingTransformer.ColumnInfo("Sentiment_Text", "TokenizedWords") + }).Append(new ValueMappingEstimator(mlContext, lookupMap, "Words", "Ids", new[] { ("TokenizedWords", "Features") })); + var dataPipe = estimator.Fit(dataView) + .CreatePredictionEngine(mlContext); + + string modelLocation = @"E:\Tensorflow\sentiment_model"; + var tfEnginePipe = new TensorFlowEstimator(mlContext, modelLocation, new[] { "Features" }, new[] { "Prediction/Softmax" }) + .Append(new ColumnCopyingEstimator(mlContext, ("Prediction/Softmax", "Prediction"))) + .Fit(dataView) + .CreatePredictionEngine(mlContext); + + var processedData = dataPipe.Predict(data[0]); + Array.Resize(ref processedData.Features, 600); + var prediction = tfEnginePipe.Predict(processedData); + + Assert.Equal(2, prediction.Prediction.Length); + } } } diff --git a/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs b/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs index 20c24b5fe4..7cac5ccd34 100644 --- a/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs @@ -111,70 +111,6 @@ public void ValueMapInputIsVectorTest() Assert.Equal(1, fValue); } - class vec - { - [VectorType(600)] - public int[] Features; - } - - IEnumerable PadSentence(IDataView dataview) - { - var cursor = dataview.GetRowCursorForAllColumns(); - var getterVecD = cursor.GetGetter>(dataview.Schema["VecD"].Index); - while(cursor.MoveNext()) - { - - var v = new vec() - { - Features = new int[600] - }; - - VBuffer dValue = default; - getterVecD(ref dValue); - var values = dValue.GetValues(); - var indices 
= dValue.GetIndices(); - if (indices.Length > 0) - { - for (int i = 0; i < indices.Length; i++) - { - if (indices[i] > v.Features.Length) break; - v.Features[indices[i]] = values[i]; - } - } - else - { - for (int i = 0; i < values.Length; i++) - { - if (i > v.Features.Length) break; - v.Features[i] = values[i]; - } - } - - yield return v; - } - } - - [Fact] - public void ValueMapPadInputTest() - { - var data = new[] { new TestClass() { A = "bar test foo", B = "test", C = "foo" } }; - var dataView = ML.Data.ReadFromEnumerable(data); - - var keys = new List>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() }; - var values = new List() { 1, 2, 3, 4 }; - - var estimator = new WordTokenizingEstimator(Env, new[]{ - new WordTokenizingTransformer.ColumnInfo("A", "TokenizeA") - }).Append(new ValueMappingEstimator, int>(Env, keys, values, new[] { ("TokenizeA", "VecD"), ("B", "E"), ("C", "F") })); - var t = estimator.Fit(dataView); - - var result = t.Transform(dataView); - result = ML.Data.ReadFromEnumerable(PadSentence(result)); - - string modelLocation = @"E:\Tensorflow\sentiment_model"; - IDataView trans = new TensorFlowTransformer(ML, modelLocation, "Features", "Prediction/Softmax").Transform(result); - } - [Fact] public void ValueMapInputIsVectorAndValueAsStringKeyTypeTest() { From 8397ac02da1a0e36b61969325f4d8a97cb49037f Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Mon, 28 Jan 2019 21:09:02 -0800 Subject: [PATCH 04/11] Corrected file paths. 
--- .../ScenariosWithDirectInstantiation/TensorflowTests.cs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index 2da4daa23e..d4ea925b07 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -868,7 +868,7 @@ public void TensorFlowSentimentClassificationTest() var data = new[] { new TensorFlowSentiment() { Sentiment_Text = "this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert is an amazing actor and now the same being director father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also to the two little boy's that played the of norman and paul they were just brilliant children are often left out of the list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you think the whole story was so lovely because it was true and was someone's life after all that was shared with us all" } }; var dataView = mlContext.Data.ReadFromEnumerable(data); - var lookupMap = mlContext.Data.ReadFromTextFile(@"E:\Tensorflow\sentiment_model\imdb_word_index.csv", + var lookupMap = mlContext.Data.ReadFromTextFile(@"sentiment_model\imdb_word_index.csv", columns: new[] { new 
TextLoader.Column("Words", DataKind.TX, 0), @@ -883,7 +883,7 @@ public void TensorFlowSentimentClassificationTest() var dataPipe = estimator.Fit(dataView) .CreatePredictionEngine(mlContext); - string modelLocation = @"E:\Tensorflow\sentiment_model"; + string modelLocation = @"sentiment_model"; var tfEnginePipe = new TensorFlowEstimator(mlContext, modelLocation, new[] { "Features" }, new[] { "Prediction/Softmax" }) .Append(new ColumnCopyingEstimator(mlContext, ("Prediction/Softmax", "Prediction"))) .Fit(dataView) From 0eb434b0051277c38e79e7aecadc51b469cf09b0 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Tue, 29 Jan 2019 11:01:00 -0800 Subject: [PATCH 05/11] Addressed reviewers' comments. --- .../Transforms/ConversionsExtensionsCatalog.cs | 14 ++++++++++++++ .../Microsoft.ML.Tests.csproj | 2 +- .../TensorflowTests.cs | 17 ++++++++++++----- .../Transformers/ValueMappingTests.cs | 1 - 4 files changed, 27 insertions(+), 7 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs index b8f9688307..8e1f509bd6 100644 --- a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs @@ -141,5 +141,19 @@ public static ValueMappingEstimator ValueMap values, params (string source, string name)[] columns) => new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), keys, values, columns); + + /// + /// Maps specified keys to specified values + /// + /// The categorical transform's catalog + /// An instance of that contains the key and value columns. + /// Name of the key column in . + /// Name of the value column in . + /// The columns to apply this transform on. 
+ /// + public static ValueMappingEstimator ValueMap( + this TransformsCatalog.ConversionTransforms catalog, + IDataView lookupMap, string keyColumn, string valueColumn, params (string input, string output)[] columns) + => new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), lookupMap, keyColumn, valueColumn, columns); } } diff --git a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj index e655c73f33..2d56666b7f 100644 --- a/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj +++ b/test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj @@ -46,7 +46,7 @@ - + diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index d4ea925b07..9858d23d30 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -877,15 +877,21 @@ public void TensorFlowSentimentClassificationTest() separatorChar: ',' ); - var estimator = new WordTokenizingEstimator(mlContext, new[]{ - new WordTokenizingTransformer.ColumnInfo("Sentiment_Text", "TokenizedWords") - }).Append(new ValueMappingEstimator(mlContext, lookupMap, "Words", "Ids", new[] { ("TokenizedWords", "Features") })); + // We cannot resize variable length vector to fixed lenght vector in ML.Net + // The trick here is to create two pipelines. + // The first pipeline tokenzies the strings into words and maps the words to an integer which is an index in the dictionary. + // Then this integer vector is retrieved from the pipeline and resized to fixed length. + // The second pipeline takes the resized integer vector and passed to TensoFlow and get the classification scores. 
+ var estimator = mlContext.Transforms.Text.TokenizeWords("Sentiment_Text", "TokenizedWords") + .Append(mlContext.Transforms.Conversion.ValueMap(lookupMap, "Words", "Ids", new[] { ("TokenizedWords", "Features") })); var dataPipe = estimator.Fit(dataView) .CreatePredictionEngine(mlContext); + // For explanation on how was the `sentiment_model` created + // c.f. https://github.com/dotnet/machinelearning-testdata/blob/master/Microsoft.ML.TensorFlow.TestModels/sentiment_model/README.md string modelLocation = @"sentiment_model"; - var tfEnginePipe = new TensorFlowEstimator(mlContext, modelLocation, new[] { "Features" }, new[] { "Prediction/Softmax" }) - .Append(new ColumnCopyingEstimator(mlContext, ("Prediction/Softmax", "Prediction"))) + var tfEnginePipe = mlContext.Transforms.ScoreTensorFlowModel(modelLocation, new[] { "Features" }, new[] { "Prediction/Softmax" }) + .Append(mlContext.Transforms.CopyColumns(("Prediction/Softmax", "Prediction"))) .Fit(dataView) .CreatePredictionEngine(mlContext); @@ -894,6 +900,7 @@ public void TensorFlowSentimentClassificationTest() var prediction = tfEnginePipe.Predict(processedData); Assert.Equal(2, prediction.Prediction.Length); + Assert.InRange(prediction.Prediction[1], 0.650032759 - 0.01, 0.650032759 + 0.01); } } } diff --git a/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs b/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs index 8ef25b40d3..7d84dbdfa3 100644 --- a/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs @@ -11,7 +11,6 @@ using Microsoft.ML.Model; using Microsoft.ML.RunTests; using Microsoft.ML.Tools; -using Microsoft.ML.Transforms; using Microsoft.ML.Transforms.Conversions; using Microsoft.ML.Transforms.Text; using Xunit; From fdc08684c05cf55824ddd90a6dc33d874fd79312 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Tue, 29 Jan 2019 11:20:24 -0800 Subject: [PATCH 06/11] Addressed reviewers' comments. 
--- .../ScenariosWithDirectInstantiation/TensorflowTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index 9858d23d30..c41f0cf26a 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -877,7 +877,7 @@ public void TensorFlowSentimentClassificationTest() separatorChar: ',' ); - // We cannot resize variable length vector to fixed lenght vector in ML.Net + // We cannot resize variable length vector to fixed length vector in ML.Net // The trick here is to create two pipelines. // The first pipeline tokenzies the strings into words and maps the words to an integer which is an index in the dictionary. // Then this integer vector is retrieved from the pipeline and resized to fixed length. From daa4333e0979842952752d879f7568dd559aaa95 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Tue, 29 Jan 2019 12:21:28 -0800 Subject: [PATCH 07/11] Fixing a problem in path. 
--- .../ScenariosWithDirectInstantiation/TensorflowTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index c41f0cf26a..487bde6cb2 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -868,7 +868,7 @@ public void TensorFlowSentimentClassificationTest() var data = new[] { new TensorFlowSentiment() { Sentiment_Text = "this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert is an amazing actor and now the same being director father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also to the two little boy's that played the of norman and paul they were just brilliant children are often left out of the list i think because the stars that play them all grown up are such a big profile for the whole film but these children are amazing and should be praised for what they have done don't you think the whole story was so lovely because it was true and was someone's life after all that was shared with us all" } }; var dataView = mlContext.Data.ReadFromEnumerable(data); - var lookupMap = mlContext.Data.ReadFromTextFile(@"sentiment_model\imdb_word_index.csv", + var lookupMap = mlContext.Data.ReadFromTextFile(@"sentiment_model/imdb_word_index.csv", columns: new[] { new 
TextLoader.Column("Words", DataKind.TX, 0), From 0cc516e7e98efbae689c71a83330386526b0484e Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Tue, 29 Jan 2019 12:44:45 -0800 Subject: [PATCH 08/11] Addressed reviewers' comments. --- .../ScenariosWithDirectInstantiation/TensorflowTests.cs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index 487bde6cb2..ba0701af58 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -877,11 +877,11 @@ public void TensorFlowSentimentClassificationTest() separatorChar: ',' ); - // We cannot resize variable length vector to fixed length vector in ML.Net + // We cannot resize variable length vector to fixed length vector in ML.NET // The trick here is to create two pipelines. - // The first pipeline tokenzies the strings into words and maps the words to an integer which is an index in the dictionary. + // The first pipeline 'dataPipe' tokenzies the string into words and maps each word to an integer which is an index in the dictionary. // Then this integer vector is retrieved from the pipeline and resized to fixed length. - // The second pipeline takes the resized integer vector and passed to TensoFlow and get the classification scores. + // The second pipeline 'tfEnginePipe' takes the resized integer vector and passed to TensoFlow and get the classification scores. 
var estimator = mlContext.Transforms.Text.TokenizeWords("Sentiment_Text", "TokenizedWords") .Append(mlContext.Transforms.Conversion.ValueMap(lookupMap, "Words", "Ids", new[] { ("TokenizedWords", "Features") })); var dataPipe = estimator.Fit(dataView) From ddbd9da7c8beffd3f02899420aad3c0e6c275673 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Tue, 29 Jan 2019 13:04:22 -0800 Subject: [PATCH 09/11] Addressed reviewers' comments. --- .../Transforms/ConversionsExtensionsCatalog.cs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs index 8e1f509bd6..9057f8c3fb 100644 --- a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs @@ -143,7 +143,9 @@ public static ValueMappingEstimator ValueMap new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), keys, values, columns); /// - /// Maps specified keys to specified values + /// Maps the using the keys in the dictionary to the values of dictionary. + /// In this case, the is used as a dictionary where + /// and specify the keys and values of dictionary respectively. /// /// The categorical transform's catalog /// An instance of that contains the key and value columns. From 57e730c85dbab925a7adaa80c20904ea5b68b63c Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Tue, 29 Jan 2019 15:08:00 -0800 Subject: [PATCH 10/11] Addressed reviewers' comments. 
--- .../Transforms/ConversionsExtensionsCatalog.cs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs index 9057f8c3fb..50d88d3bda 100644 --- a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs @@ -143,8 +143,9 @@ public static ValueMappingEstimator ValueMap new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), keys, values, columns); /// - /// Maps the using the keys in the dictionary to the values of dictionary. - /// In this case, the is used as a dictionary where + /// Maps the using the keys in the dictionary to the values of dictionary i.e. + /// a value 'x' in the would be mapped to a value stored in dictionary[x]. + /// In this case, the is used to build up the dictionary where /// and specify the keys and values of dictionary respectively. /// /// The categorical transform's catalog /// An instance of that contains the key and value columns. From f984e0f77403b503c172ce15ea593fcfc2760831 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Tue, 29 Jan 2019 15:44:59 -0800 Subject: [PATCH 11/11] Merged with base and addressed reviewers' comments. 
--- .../Transforms/ConversionsExtensionsCatalog.cs | 2 +- .../TensorflowTests.cs | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs index f0a7a78079..cdcd3a83f2 100644 --- a/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs +++ b/src/Microsoft.ML.Data/Transforms/ConversionsExtensionsCatalog.cs @@ -155,7 +155,7 @@ public static ValueMappingEstimator ValueMap public static ValueMappingEstimator ValueMap( this TransformsCatalog.ConversionTransforms catalog, - IDataView lookupMap, string keyColumn, string valueColumn, params (string input, string output)[] columns) + IDataView lookupMap, string keyColumn, string valueColumn, params (string outputColumnName, string inputColumnName)[] columns) => new ValueMappingEstimator(CatalogUtils.GetEnvironment(catalog), lookupMap, keyColumn, valueColumn, columns); } } diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index 8076670e36..bb46984a38 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -881,17 +881,17 @@ public void TensorFlowSentimentClassificationTest() // The trick here is to create two pipelines. // The first pipeline 'dataPipe' tokenzies the string into words and maps each word to an integer which is an index in the dictionary. // Then this integer vector is retrieved from the pipeline and resized to fixed length. - // The second pipeline 'tfEnginePipe' takes the resized integer vector and passed to TensoFlow and get the classification scores. 
- var estimator = mlContext.Transforms.Text.TokenizeWords("Sentiment_Text", "TokenizedWords") - .Append(mlContext.Transforms.Conversion.ValueMap(lookupMap, "Words", "Ids", new[] { ("TokenizedWords", "Features") })); + // The second pipeline 'tfEnginePipe' takes the resized integer vector and passes it to TensorFlow and gets the classification scores. + var estimator = mlContext.Transforms.Text.TokenizeWords("TokenizedWords", "Sentiment_Text") + .Append(mlContext.Transforms.Conversion.ValueMap(lookupMap, "Words", "Ids", new[] { ("Features", "TokenizedWords") })); var dataPipe = estimator.Fit(dataView) .CreatePredictionEngine(mlContext); // For explanation on how was the `sentiment_model` created // c.f. https://github.com/dotnet/machinelearning-testdata/blob/master/Microsoft.ML.TensorFlow.TestModels/sentiment_model/README.md string modelLocation = @"sentiment_model"; - var tfEnginePipe = mlContext.Transforms.ScoreTensorFlowModel(modelLocation, new[] { "Features" }, new[] { "Prediction/Softmax" }) - .Append(mlContext.Transforms.CopyColumns(("Prediction/Softmax", "Prediction"))) + var tfEnginePipe = mlContext.Transforms.ScoreTensorFlowModel(modelLocation, new[] { "Prediction/Softmax" }, new[] { "Features" }) + .Append(mlContext.Transforms.CopyColumns(("Prediction", "Prediction/Softmax"))) .Fit(dataView) .CreatePredictionEngine(mlContext);