From cbd8ad36a85e50a8b6859d4a3d72f6f2763bb8f0 Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Thu, 29 Nov 2018 21:02:59 +0000 Subject: [PATCH 1/2] Adding LDA sample to Microsoft.ML.Samples --- .../Dynamic/LdaTransform.cs | 66 +++++++++++++++++++ .../Microsoft.ML.Samples.csproj | 1 + docs/samples/Microsoft.ML.Samples/Program.cs | 2 +- .../Text/TextCatalog.cs | 7 ++ 4 files changed, 75 insertions(+), 1 deletion(-) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs new file mode 100644 index 0000000000..231f636f71 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs @@ -0,0 +1,66 @@ +using Microsoft.ML.Data; +using Microsoft.ML.Runtime.Api; +using Microsoft.ML.Runtime.Data; +using System; +using System.Collections.Generic; + +namespace Microsoft.ML.Samples.Dynamic +{ + public class LdaTransformExample + { + public static void LdaTransform() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var ml = new MLContext(); + + // Get a small dataset as an IEnumerable. + IEnumerable data = SamplesUtils.DatasetUtils.GetTopicsData(); + var trainData = ml.CreateStreamingDataView(data); + + // Preview of one of the columns of the topics data. + // The Review column contains the keys associated with a particular body of text. + // + // Review + // "animals birds cats dogs fish horse" + // "horse birds house fish duck cats" + // "car truck driver bus pickup" + // "car truck driver bus pickup horse" + + // A pipeline for featurizing the "Review" column + string ldaFeatures = "LdaFeatures"; + var pipeline = ml.Transforms.Text.ProduceWordBags("Review"). 
+ Append(ml.Transforms.Text.LatentDirichletAllocation("Review", ldaFeatures, numTopic:3)); + + // The transformed data + var transformer = pipeline.Fit(trainData); + var transformed_data = transformer.Transform(trainData); + + // Small helper to print the text inside the columns, in the console. + Action>> printHelper = (columnName, column) => + { + Console.WriteLine($"{columnName} column obtained post-transformation."); + foreach (var featureRow in column) + { + foreach (var value in featureRow.GetValues()) + Console.Write($"{value} "); + Console.WriteLine(""); + } + + Console.WriteLine("==================================================="); + }; + + // Preview of the column obtained after processing the input. + var defaultColumn = transformed_data.GetColumn>(ml, ldaFeatures); + printHelper(ldaFeatures, defaultColumn); + + // LdaFeatures column obtained post-transformation. + // For LDA, we had specified numTopic:3. Hence each row of text has been featurized as a vector of floats with length 3. 
+ + //0.1818182 0.4545455 0.3636364 + //0.3636364 0.1818182 0.4545455 + //0.2222222 0.2222222 0.5555556 + //0.2727273 0.09090909 0.6363636 + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj index 2b9041c51b..abea9f3e8d 100644 --- a/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj +++ b/docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj @@ -17,6 +17,7 @@ + false diff --git a/docs/samples/Microsoft.ML.Samples/Program.cs b/docs/samples/Microsoft.ML.Samples/Program.cs index d8730391b2..83a7e8c2e8 100644 --- a/docs/samples/Microsoft.ML.Samples/Program.cs +++ b/docs/samples/Microsoft.ML.Samples/Program.cs @@ -6,7 +6,7 @@ internal static class Program { static void Main(string[] args) { - MatrixFactorizationExample.MatrixFactorizationInMemoryData(); + LdaTransformExample.LdaTransform(); } } } diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index aaed670b0b..52cf642d2f 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -507,6 +507,13 @@ public static NgramHashEstimator ProduceHashedNgrams(this TransformsCatalog.Text /// The number of words to summarize the topic. /// The number of burn-in iterations. /// Reset the random number generator for each document. + /// + /// + /// + /// + /// public static LatentDirichletAllocationEstimator LatentDirichletAllocation(this TransformsCatalog.TextTransforms catalog, string inputColumn, string outputColumn = null, From 1014be4c44309b71e6f4042beaeddf5e93486f3c Mon Sep 17 00:00:00 2001 From: Abhishek Goswami Date: Thu, 29 Nov 2018 22:05:03 +0000 Subject: [PATCH 2/2] review comments - 1. 
nit changes --- .../Dynamic/LdaTransform.cs | 33 +++++++++---------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs index 231f636f71..f2cbec7c28 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs @@ -27,32 +27,29 @@ public static void LdaTransform() // "car truck driver bus pickup" // "car truck driver bus pickup horse" - // A pipeline for featurizing the "Review" column + string review = nameof(SamplesUtils.DatasetUtils.SampleTopicsData.Review); string ldaFeatures = "LdaFeatures"; - var pipeline = ml.Transforms.Text.ProduceWordBags("Review"). - Append(ml.Transforms.Text.LatentDirichletAllocation("Review", ldaFeatures, numTopic:3)); + + // A pipeline for featurizing the "Review" column + var pipeline = ml.Transforms.Text.ProduceWordBags(review). + Append(ml.Transforms.Text.LatentDirichletAllocation(review, ldaFeatures, numTopic:3)); // The transformed data var transformer = pipeline.Fit(trainData); var transformed_data = transformer.Transform(trainData); - // Small helper to print the text inside the columns, in the console. - Action>> printHelper = (columnName, column) => - { - Console.WriteLine($"{columnName} column obtained post-transformation."); - foreach (var featureRow in column) - { - foreach (var value in featureRow.GetValues()) - Console.Write($"{value} "); - Console.WriteLine(""); - } + // Column obtained after processing the input. 
+ var ldaFeaturesColumn = transformed_data.GetColumn>(ml, ldaFeatures); - Console.WriteLine("==================================================="); - }; + Console.WriteLine($"{ldaFeatures} column obtained post-transformation."); + foreach (var featureRow in ldaFeaturesColumn) + { + foreach (var value in featureRow.GetValues()) + Console.Write($"{value} "); + Console.WriteLine(""); + } - // Preview of the column obtained after processing the input. - var defaultColumn = transformed_data.GetColumn>(ml, ldaFeatures); - printHelper(ldaFeatures, defaultColumn); + Console.WriteLine("==================================================="); // LdaFeatures column obtained post-transformation. // For LDA, we had specified numTopic:3. Hence each row of text has been featurized as a vector of floats with length 3.