diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs
deleted file mode 100644
index 239e7d93ac..0000000000
--- a/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs
+++ /dev/null
@@ -1,61 +0,0 @@
-using System;
-using System.Collections.Generic;
-using Microsoft.ML.Data;
-
-namespace Microsoft.ML.Samples.Dynamic
-{
-    public static class LatentDirichletAllocationTransform
-    {
-        public static void Example()
-        {
-            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
-            // as well as the source of randomness.
-            var ml = new MLContext();
-
-            // Get a small dataset as an IEnumerable and then read it as a ML.NET data set.
-            IEnumerable<SamplesUtils.DatasetUtils.SampleTopicsData> data = SamplesUtils.DatasetUtils.GetTopicsData();
-            var trainData = ml.Data.LoadFromEnumerable(data);
-
-            // Preview of one of the columns of the the topics data.
-            // The Review column contains the keys associated with a particular body of text.
-            //
-            // Review
-            // "animals birds cats dogs fish horse"
-            // "horse birds house fish duck cats"
-            // "car truck driver bus pickup"
-            // "car truck driver bus pickup horse"
-
-            string review = nameof(SamplesUtils.DatasetUtils.SampleTopicsData.Review);
-            string ldaFeatures = "LdaFeatures";
-
-            // A pipeline for featurizing the "Review" column
-            var pipeline = ml.Transforms.Text.ProduceWordBags(review).
-                Append(ml.Transforms.Text.LatentDirichletAllocation(review, ldaFeatures, numberOfTopics: 3));
-
-            // The transformed data
-            var transformer = pipeline.Fit(trainData);
-            var transformed_data = transformer.Transform(trainData);
-
-            // Column obtained after processing the input.
-            var ldaFeaturesColumn = transformed_data.GetColumn<VBuffer<float>>(transformed_data.Schema[ldaFeatures]);
-
-            Console.WriteLine($"{ldaFeatures} column obtained post-transformation.");
-            foreach (var featureRow in ldaFeaturesColumn)
-            {
-                foreach (var value in featureRow.GetValues())
-                    Console.Write($"{value} ");
-                Console.WriteLine("");
-            }
-
-            Console.WriteLine("===================================================");
-
-            // LdaFeatures column obtained post-transformation.
-            // For LDA, we had specified numTopic:3. Hence each row of text has been featurized as a vector of floats with length 3.
-
-            //0.1818182 0.4545455 0.3636364
-            //0.3636364 0.1818182 0.4545455
-            //0.2222222 0.2222222 0.5555556
-            //0.2727273 0.09090909 0.6363636
-        }
-    }
-}
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs
new file mode 100644
index 0000000000..3efe734f06
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/LatentDirichletAllocation.cs
@@ -0,0 +1,74 @@
+using System;
+using System.Collections.Generic;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class LatentDirichletAllocation
+    {
+        public static void Example()
+        {
+            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
+            // as well as the source of randomness.
+            var mlContext = new MLContext();
+
+            // Create a small dataset as an IEnumerable.
+            var samples = new List<TextData>()
+            {
+                new TextData(){ Text = "ML.NET's LatentDirichletAllocation API computes topic models." },
+                new TextData(){ Text = "ML.NET's LatentDirichletAllocation API is the best for topic models." },
+                new TextData(){ Text = "I like to eat broccoli and bananas." },
+                new TextData(){ Text = "I eat bananas for breakfast." },
+                new TextData(){ Text = "This car is expensive compared to last week's price." },
+                new TextData(){ Text = "This car was $X last week." },
+            };
+
+            // Convert training data to IDataView.
+            var dataview = mlContext.Data.LoadFromEnumerable(samples);
+
+            // A pipeline for featurizing the text/string using LatentDirichletAllocation API.
+            // To be more accurate in computing the LDA features, the pipeline first normalizes text and removes stop words
+            // before passing tokens (the individual words, lower cased, with common words removed) to LatentDirichletAllocation.
+            var pipeline = mlContext.Transforms.Text.NormalizeText("NormalizedText", "Text")
+                .Append(mlContext.Transforms.Text.TokenizeIntoWords("Tokens", "NormalizedText"))
+                .Append(mlContext.Transforms.Text.RemoveDefaultStopWords("Tokens"))
+                .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
+                .Append(mlContext.Transforms.Text.ProduceNgrams("Tokens"))
+                .Append(mlContext.Transforms.Text.LatentDirichletAllocation("Features", "Tokens", numberOfTopics: 3));
+
+            // Fit to data.
+            var transformer = pipeline.Fit(dataview);
+
+            // Create the prediction engine to get the LDA features extracted from the text.
+            var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(transformer);
+
+            // Convert the sample text into LDA features and print them.
+            PrintLdaFeatures(predictionEngine.Predict(samples[0]));
+            PrintLdaFeatures(predictionEngine.Predict(samples[1]));
+
+            // Features obtained post-transformation.
+            // For LatentDirichletAllocation, we had specified numberOfTopics: 3. Hence each prediction has been featurized as a vector of floats with length 3.
+
+            // Topic1 Topic2 Topic3
+            // 0.6364 0.2727 0.0909
+            // 0.5455 0.1818 0.2727
+        }
+
+        private static void PrintLdaFeatures(TransformedTextData prediction)
+        {
+            for (int i = 0; i < prediction.Features.Length; i++)
+                Console.Write($"{prediction.Features[i]:F4} ");
+            Console.WriteLine();
+        }
+
+        private class TextData
+        {
+            public string Text { get; set; }
+        }
+
+        private class TransformedTextData : TextData
+        {
+            public float[] Features { get; set; }
+        }
+    }
+}
diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
index db412be77c..5ddd402b2a 100644
--- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
+++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
@@ -509,7 +509,7 @@ internal static NgramHashingEstimator ProduceHashedNgrams(this TransformsCatalog
/// /// /// /// ///
///