diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs
new file mode 100644
index 0000000000..8a23517149
--- /dev/null
+++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs
@@ -0,0 +1,89 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+using Microsoft.ML.Data;
+using Microsoft.ML.Transforms.Text;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class ProduceHashedWordBags
+    {
+        public static void Example()
+        {
+            // Create a new ML context for ML.NET operations. It can be used for exception tracking and logging,
+            // as well as the source of randomness.
+            var mlContext = new MLContext();
+
+            // Create a small dataset as an IEnumerable.
+            var samples = new List<TextData>()
+            {
+                new TextData(){ Text = "This is an example to compute bag-of-word features using hashing." },
+                new TextData(){ Text = "ML.NET's ProduceHashedWordBags API produces count of n-grams and hashes it as an index into a vector of given bit length." },
+                new TextData(){ Text = "It does so by first tokenizing text/string into words/tokens then " },
+                new TextData(){ Text = "computing n-grams and hash them to the index given by hash value." },
+                new TextData(){ Text = "The hashing reduces the size of the output feature vector" },
+                new TextData(){ Text = "which is useful in case when number of n-grams is very large." },
+            };
+
+            // Convert training data to an IDataView.
+            var dataview = mlContext.Data.LoadFromEnumerable(samples);
+
+            // A pipeline for converting text into numeric bag-of-word features using hashing.
+            // The following call to 'ProduceHashedWordBags' implicitly tokenizes the text/string into words/tokens.
+            // Please note that the length of the output feature vector depends on the 'numberOfBits' setting.
+            var textPipeline = mlContext.Transforms.Text.ProduceHashedWordBags("BagOfWordFeatures", "Text",
+                numberOfBits: 5,
+                ngramLength: 3,
+                useAllLengths: false,
+                maximumNumberOfInverts: 1);
+
+            // Fit to data.
+            var textTransformer = textPipeline.Fit(dataview);
+            var transformedDataView = textTransformer.Transform(dataview);
+
+            // Create the prediction engine to get the bag-of-word features extracted from the text.
+            var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);
+
+            // Convert the text into numeric features.
+            var prediction = predictionEngine.Predict(samples[0]);
+
+            // Print the length of the feature vector.
+            Console.WriteLine($"Number of Features: {prediction.BagOfWordFeatures.Length}");
+
+            // Preview of the produced n-grams.
+            // Get the slot names from the column's metadata.
+            // The slot names for a vector column correspond to the names associated with each position in the vector.
+            VBuffer<ReadOnlyMemory<char>> slotNames = default;
+            transformedDataView.Schema["BagOfWordFeatures"].GetSlotNames(ref slotNames);
+            var BagOfWordFeaturesColumn = transformedDataView.GetColumn<VBuffer<float>>(transformedDataView.Schema["BagOfWordFeatures"]);
+            var slots = slotNames.GetValues();
+            Console.Write("N-grams: ");
+            foreach (var featureRow in BagOfWordFeaturesColumn)
+            {
+                foreach (var item in featureRow.Items())
+                    Console.Write($"{slots[item.Key]} ");
+                Console.WriteLine();
+            }
+
+            // Print the first 10 feature values.
+ Console.Write("Features: "); + for (int i = 0; i < 10; i++) + Console.Write($"{prediction.BagOfWordFeatures[i]:F4} "); + + // Expected output: + // Number of Features: 32 + // N-grams: an|example|to is|an|example example|to|compute This|is|an compute|bag-of-word|features bag-of-word|features|using to|compute|bag-of-word ML.NET's|ProduceHashedWordBags|API as|an|index API|produces|count ... + // Features: 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 2.0000 0.0000 0.0000 ... + } + + private class TextData + { + public string Text { get; set; } + } + + private class TransformedTextData : TextData + { + public float[] BagOfWordFeatures { get; set; } + } + } +} diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs new file mode 100644 index 0000000000..9e9ab553a9 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs @@ -0,0 +1,89 @@ +using System; +using System.Collections.Generic; +using System.Text; +using Microsoft.ML.Data; +using Microsoft.ML.Transforms.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class ProduceWordBags + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create a small dataset as an IEnumerable. + var samples = new List() + { + new TextData(){ Text = "This is an example to compute bag-of-word features." }, + new TextData(){ Text = "ML.NET's ProduceWordBags API produces bag-of-word features from input text." }, + new TextData(){ Text = "It does so by first tokenizing text/string into words/tokens then " }, + new TextData(){ Text = "computing n-grams and their neumeric values." }, + new TextData(){ Text = "Each position in the output vector corresponds to a particular n-gram." }, + new TextData(){ Text = "The value at each position corresponds to," }, + new TextData(){ Text = "the number of times n-gram occured in the data (Tf), or" }, + new TextData(){ Text = "the inverse of the number of documents contain the n-gram (Idf)," }, + new TextData(){ Text = "or compute both and multipy together (Tf-Idf)." }, + }; + + // Convert training data to IDataView. + var dataview = mlContext.Data.LoadFromEnumerable(samples); + + // A pipeline for converting text into numeric bag-of-word features. + // The following call to 'ProduceWordBags' implicitly tokenizes the text/string into words/tokens. + // Please note that the length of the output feature vector depends on the n-gram settings. + var textPipeline = mlContext.Transforms.Text.ProduceWordBags("BagOfWordFeatures", "Text", + ngramLength: 3, useAllLengths: false, weighting: NgramExtractingEstimator.WeightingCriteria.Tf); + + // Fit to data. + var textTransformer = textPipeline.Fit(dataview); + var transformedDataView = textTransformer.Transform(dataview); + + // Create the prediction engine to get the bag-of-word features extracted from the text. + var predictionEngine = mlContext.Model.CreatePredictionEngine(textTransformer); + + // Convert the text into numeric features. + var prediction = predictionEngine.Predict(samples[0]); + + // Print the length of the feature vector. + Console.WriteLine($"Number of Features: {prediction.BagOfWordFeatures.Length}"); + + // Preview of the produced n-grams. + // Get the slot names from the column's metadata. 
+            // The slot names for a vector column correspond to the names associated with each position in the vector.
+            VBuffer<ReadOnlyMemory<char>> slotNames = default;
+            transformedDataView.Schema["BagOfWordFeatures"].GetSlotNames(ref slotNames);
+            var BagOfWordFeaturesColumn = transformedDataView.GetColumn<VBuffer<float>>(transformedDataView.Schema["BagOfWordFeatures"]);
+            var slots = slotNames.GetValues();
+            Console.Write("N-grams: ");
+            foreach (var featureRow in BagOfWordFeaturesColumn)
+            {
+                foreach (var item in featureRow.Items())
+                    Console.Write($"{slots[item.Key]} ");
+                Console.WriteLine();
+            }
+
+            // Print the first 10 feature values.
+            Console.Write("Features: ");
+            for (int i = 0; i < 10; i++)
+                Console.Write($"{prediction.BagOfWordFeatures[i]:F4} ");
+
+            // Expected output:
+            //   Number of Features: 62
+            //   N-grams:  This|is|an is|an|example an|example|to example|to|compute to|compute|bag-of-word compute|bag-of-word|features. ML.NET's|ProduceWordBags|API ProduceWordBags|API|produces API|produces|bag-of-word produces|bag-of-word|features ...
+            //   Features: 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 ...
+        }
+
+        private class TextData
+        {
+            public string Text { get; set; }
+        }
+
+        private class TransformedTextData : TextData
+        {
+            public float[] BagOfWordFeatures { get; set; }
+        }
+    }
+}
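
Both new samples note that, with hashing, the output length is fixed by 'numberOfBits' (here 2^5 = 32 slots) rather than by how many distinct n-grams the data contains. The standalone sketch below, which is not part of the diff above, illustrates that relationship using the same ProduceHashedWordBags API; the HashedBagVectorLength class, the "HashedFeatures" column name, and the bit widths tried are illustrative assumptions, not part of the change.

// A minimal sketch (not part of the diff above): the hashed bag-of-words vector
// always has 2^numberOfBits slots, independent of the input text.
// Class and column names here are illustrative.
using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;

public static class HashedBagVectorLength
{
    private class TextData
    {
        public string Text { get; set; }
    }

    public static void Example()
    {
        var mlContext = new MLContext();

        // A single input row is enough to inspect the output vector shape.
        var samples = new List<TextData>()
        {
            new TextData() { Text = "This is an example to compute bag-of-word features using hashing." },
        };
        var dataView = mlContext.Data.LoadFromEnumerable(samples);

        foreach (var bits in new[] { 5, 10, 16 })
        {
            // Same transform as in the samples, but with varying 'numberOfBits'.
            var pipeline = mlContext.Transforms.Text.ProduceHashedWordBags(
                "HashedFeatures", "Text",
                numberOfBits: bits, ngramLength: 3, useAllLengths: false);

            var transformed = pipeline.Fit(dataView).Transform(dataView);
            var firstRow = transformed
                .GetColumn<VBuffer<float>>(transformed.Schema["HashedFeatures"])
                .First();

            // Expected relationship: vector length == 2^numberOfBits (32, 1024, 65536).
            Console.WriteLine($"numberOfBits: {bits}, vector length: {firstRow.Length}");
        }
    }
}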