|
| 1 | +using System; |
| 2 | +using System.Collections.Generic; |
| 3 | +using System.Text; |
| 4 | +using Microsoft.ML.Data; |
| 5 | +using Microsoft.ML.Transforms.Text; |
| 6 | + |
namespace Microsoft.ML.Samples.Dynamic
{
    /// <summary>
    /// Sample that converts free text into numeric bag-of-word (n-gram) features
    /// using the <c>ProduceWordBags</c> text transform.
    /// </summary>
    public static class ProduceWordBags
    {
        /// <summary>
        /// Builds a small in-memory text dataset, fits a <c>ProduceWordBags</c> pipeline on it,
        /// and prints the feature-vector length, the n-grams present in each row, and the
        /// leading feature values computed for the first sample.
        /// </summary>
        public static void Example()
        {
            // Column names. The output column name must match the property on
            // TransformedTextData so the prediction engine can bind to it.
            const string inputColumn = nameof(TextData.Text);
            const string featuresColumn = nameof(TransformedTextData.BagOfWordFeatures);

            // Upper bound on how many feature values are printed at the end.
            const int maxFeaturesToPrint = 10;

            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
            // as well as the source of randomness.
            var mlContext = new MLContext();

            // Create a small dataset as an IEnumerable.
            var samples = new List<TextData>()
            {
                new TextData(){ Text = "This is an example to compute bag-of-word features." },
                new TextData(){ Text = "ML.NET's ProduceWordBags API produces bag-of-word features from input text." },
                new TextData(){ Text = "It does so by first tokenizing text/string into words/tokens then " },
                new TextData(){ Text = "computing n-grams and their neumeric values." },
                new TextData(){ Text = "Each position in the output vector corresponds to a particular n-gram." },
                new TextData(){ Text = "The value at each position corresponds to," },
                new TextData(){ Text = "the number of times n-gram occured in the data (Tf), or" },
                new TextData(){ Text = "the inverse of the number of documents contain the n-gram (Idf)," },
                new TextData(){ Text = "or compute both and multipy together (Tf-Idf)." },
            };

            // Convert training data to IDataView.
            var dataview = mlContext.Data.LoadFromEnumerable(samples);

            // A pipeline for converting text into numeric bag-of-word features.
            // The following call to 'ProduceWordBags' implicitly tokenizes the text/string into words/tokens.
            // Please note that the length of the output feature vector depends on the n-gram settings.
            var textPipeline = mlContext.Transforms.Text.ProduceWordBags(featuresColumn, inputColumn,
                ngramLength: 3, useAllLengths: false, weighting: NgramExtractingEstimator.WeightingCriteria.Tf);

            // Fit to data.
            var textTransformer = textPipeline.Fit(dataview);
            var transformedDataView = textTransformer.Transform(dataview);

            // Create the prediction engine to get the bag-of-word features extracted from the text.
            var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

            // Convert the text into numeric features.
            var prediction = predictionEngine.Predict(samples[0]);

            // Print the length of the feature vector.
            Console.WriteLine($"Number of Features: {prediction.BagOfWordFeatures.Length}");

            // Preview of the produced n-grams.
            // Get the slot names from the column's metadata.
            // The slot names for a vector column correspond to the names associated with each position in the vector.
            VBuffer<ReadOnlyMemory<char>> slotNames = default;
            transformedDataView.Schema[featuresColumn].GetSlotNames(ref slotNames);
            var bagOfWordFeaturesColumn = transformedDataView.GetColumn<VBuffer<float>>(transformedDataView.Schema[featuresColumn]);

            // NOTE(review): indexing GetValues() by item.Key assumes the slot-name buffer is dense - confirm.
            var slots = slotNames.GetValues();
            Console.Write("N-grams: ");
            foreach (var featureRow in bagOfWordFeaturesColumn)
            {
                foreach (var item in featureRow.Items())
                {
                    Console.Write($"{slots[item.Key]} ");
                }

                // One output line per input row.
                Console.WriteLine();
            }

            // Print the first few feature values; never read past the end of the vector.
            int featuresToPrint = Math.Min(maxFeaturesToPrint, prediction.BagOfWordFeatures.Length);
            Console.Write("Features: ");
            for (int i = 0; i < featuresToPrint; i++)
            {
                Console.Write($"{prediction.BagOfWordFeatures[i]:F4} ");
            }

            // Terminate the "Features:" line.
            Console.WriteLine();

            // Expected output (the n-grams of each input row are printed on their own line;
            // they are shown joined and truncated here):
            // Number of Features: 62
            // N-grams: This|is|an is|an|example an|example|to example|to|compute to|compute|bag-of-word compute|bag-of-word|features. ML.NET's|ProduceWordBags|API ProduceWordBags|API|produces API|produces|bag-of-word produces|bag-of-word|features ...
            // Features: 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 ...
        }

        /// <summary>
        /// Input row: a single piece of raw text.
        /// </summary>
        private class TextData
        {
            public string Text { get; set; }
        }

        /// <summary>
        /// Output row: the input text plus the bag-of-word feature vector produced by the pipeline.
        /// </summary>
        private class TransformedTextData : TextData
        {
            public float[] BagOfWordFeatures { get; set; }
        }
    }
}
0 commit comments