|
| 1 | +using System; |
| 2 | +using System.Collections.Generic; |
| 3 | +using Microsoft.ML.Data; |
| 4 | +using Microsoft.ML.Transforms.Text; |
| 5 | + |
| 6 | +namespace Microsoft.ML.Samples.Dynamic |
| 7 | +{ |
/// <summary>
/// Sample showing how to call <c>FeaturizeText</c> with an explicit
/// <see cref="TextFeaturizingEstimator.Options"/> instance, and how to read back both the
/// numeric feature vector and the tokens produced for a piece of text.
/// </summary>
public static class FeaturizeTextWithOptions
{
    /// <summary>
    /// Builds a small in-memory text dataset, fits a text featurizing pipeline on it, runs the first
    /// sample through a prediction engine and writes the feature count, the leading feature values
    /// and the output tokens to the console.
    /// </summary>
    public static void Example()
    {
        // Upper bound on how many feature values are written to the console.
        const int MaxFeaturesToPrint = 10;

        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var mlContext = new MLContext();

        // Create a small dataset as an IEnumerable.
        var samples = new List<TextData>()
        {
            new TextData(){ Text = "ML.NET's FeaturizeText API uses a composition of several basic transforms to convert text into numeric features." },
            new TextData(){ Text = "This API can be used as a featurizer to perform text classification." },
            new TextData(){ Text = "There are a number of approaches to text classification." },
            new TextData(){ Text = "One of the simplest and most common approaches is called “Bag of Words”." },
            new TextData(){ Text = "Text classification can be used for a wide variety of tasks" },
            new TextData(){ Text = "such as sentiment analysis, topic detection, intent identification etc." },
        };

        // Convert training data to IDataView.
        var dataview = mlContext.Data.LoadFromEnumerable(samples);

        // A pipeline for converting text into numeric features.
        // The following call to 'FeaturizeText' instantiates 'TextFeaturizingEstimator' with given parameters.
        // The length of the output feature vector depends on these settings.
        var options = new TextFeaturizingEstimator.Options()
        {
            // Also output tokenized words
            OutputTokensColumnName = "OutputTokens",
            CaseMode = TextNormalizingEstimator.CaseMode.Lower,
            // Use ML.NET's built-in stop word remover
            StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options() { Language = TextFeaturizingEstimator.Language.English },
            WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 2, UseAllLengths = true },
            CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false },
        };
        var textPipeline = mlContext.Transforms.Text.FeaturizeText("Features", options, "Text");

        // Fit to data.
        var textTransformer = textPipeline.Fit(dataview);

        // Create the prediction engine to get the features extracted from the text.
        var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

        // Convert the text into numeric features.
        var prediction = predictionEngine.Predict(samples[0]);

        // Print the length of the feature vector.
        Console.WriteLine($"Number of Features: {prediction.Features.Length}");

        // Print feature values and tokens.
        // The vector length depends on the options and the data, so clamp the loop bound
        // instead of assuming at least MaxFeaturesToPrint entries exist.
        int featuresToPrint = Math.Min(MaxFeaturesToPrint, prediction.Features.Length);
        Console.Write("Features: ");
        for (int i = 0; i < featuresToPrint; i++)
        {
            Console.Write($"{prediction.Features[i]:F4} ");
        }

        Console.WriteLine($"\nTokens: {string.Join(",", prediction.OutputTokens)}");

        // Expected output (the trailing "..." only indicates that the remaining feature values
        // are not shown; it is not written by the code above):
        // Number of Features: 282
        // Features: 0.0941 0.0941 0.0941 0.0941 0.0941 0.0941 0.0941 0.0941 0.0941 0.1881 ...
        // Tokens: ml.net's,featurizetext,api,uses,composition,basic,transforms,convert,text,numeric,features.
    }

    /// <summary>
    /// Input row type: a single piece of raw text to featurize.
    /// </summary>
    public class TextData
    {
        public string Text { get; set; }
    }

    /// <summary>
    /// Output row type: the input text plus the columns produced by the pipeline.
    /// Property names match the "Features" output column and the "OutputTokens" tokens column
    /// configured in <see cref="Example"/>.
    /// </summary>
    public class TransformedTextData : TextData
    {
        public float[] Features { get; set; }
        public string[] OutputTokens { get; set; }
    }
}
| 81 | +} |
0 commit comments