Created samples for 'FeaturizeText' API. (#3120)

zeahmed · web-flow · commit 233bc2d6b1cf · 2019-03-28T15:05:22.000-07:00
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/FeaturizeText.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/FeaturizeText.cs
@@ -0,0 +1,73 @@
+﻿using System;
+using System.Collections.Generic;
+using Microsoft.ML.Data;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class FeaturizeText // Sample: 'FeaturizeText' with default settings.
+    {
+        public static void Example()
+        {
+            // Create a new ML context for ML.NET operations. It can be used for exception tracking and logging,
+            // as well as the source of randomness.
+            var mlContext = new MLContext();
+
+            // Create a small in-memory dataset: a List<TextData>, one sentence per row.
+            var samples = new List<TextData>()
+            {
+                new TextData(){ Text = "ML.NET's FeaturizeText API uses a composition of several basic transforms to convert text into numeric features." },
+                new TextData(){ Text = "This API can be used as a featurizer to perform text classification." },
+                new TextData(){ Text = "There are a number of approaches to text classification." },
+                new TextData(){ Text = "One of the simplest and most common approaches is called “Bag of Words”." },
+                new TextData(){ Text = "Text classification can be used for a wide variety of tasks" },
+                new TextData(){ Text = "such as sentiment analysis, topic detection, intent identification etc." },
+            };
+
+            // Load the in-memory samples into an IDataView so the pipeline can be fit on them.
+            var dataview = mlContext.Data.LoadFromEnumerable(samples);
+
+            // A pipeline for converting text into numeric features: reads the "Text" column, writes "Features".
+            // The following call to 'FeaturizeText' instantiates 'TextFeaturizingEstimator' with default parameters.
+            // The default settings for the TextFeaturizingEstimator are
+            //      * StopWordsRemover: None
+            //      * CaseMode: Lower
+            //      * OutputTokensColumnName: None
+            //      * KeepDiacritics: false, KeepPunctuations: true, KeepNumbers: true
+            //      * WordFeatureExtractor: NgramLength = 1
+            //      * CharFeatureExtractor: NgramLength = 3, UseAllLengths = false
+            // The length of the output feature vector depends on these settings (and on the data used in Fit).
+            var textPipeline = mlContext.Transforms.Text.FeaturizeText("Features", "Text");
+            
+            // Fit the pipeline to the sample data to obtain a transformer.
+            var textTransformer = textPipeline.Fit(dataview);
+
+            // Create the prediction engine that maps a single TextData row to a TransformedTextData row.
+            var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);
+
+            // Convert the text of the first sample into numeric features.
+            var prediction = predictionEngine.Predict(samples[0]);
+
+            // Print the length of the feature vector.
+            Console.WriteLine($"Number of Features: {prediction.Features.Length}");
+
+            // Print the first 10 feature values (the loop assumes the vector has at least 10 entries).
+            Console.Write("Features: ");
+            for (int i = 0; i < 10; i++)
+                Console.Write($"{prediction.Features[i]:F4}  ");
+
+            //  Expected output (only 10 values are printed; the trailing "..." just marks the omitted rest):
+            //   Number of Features: 332
+            //   Features: 0.0857  0.0857  0.0857  0.0857  0.0857  0.0857  0.0857  0.0857  0.0857  0.1715 ...
+        }
+
+        public class TextData // Input row type for the sample.
+        {
+            public string Text { get; set; } // Raw sentence to featurize.
+        }
+
+        public class TransformedTextData : TextData // Output row type: the input plus the computed features.
+        {
+            public float[] Features { get; set; } // Numeric feature vector read back in Example().
+        }
+    }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/FeaturizeTextWithOptions.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/FeaturizeTextWithOptions.cs
@@ -0,0 +1,81 @@
+﻿using System;
+using System.Collections.Generic;
+using Microsoft.ML.Data;
+using Microsoft.ML.Transforms.Text;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class FeaturizeTextWithOptions // Sample: 'FeaturizeText' configured through explicit options.
+    {
+        public static void Example()
+        {
+            // Create a new ML context for ML.NET operations. It can be used for exception tracking and logging,
+            // as well as the source of randomness.
+            var mlContext = new MLContext();
+
+            // Create a small in-memory dataset: a List<TextData>, one sentence per row.
+            var samples = new List<TextData>()
+            {
+                new TextData(){ Text = "ML.NET's FeaturizeText API uses a composition of several basic transforms to convert text into numeric features." },
+                new TextData(){ Text = "This API can be used as a featurizer to perform text classification." },
+                new TextData(){ Text = "There are a number of approaches to text classification." },
+                new TextData(){ Text = "One of the simplest and most common approaches is called “Bag of Words”." },
+                new TextData(){ Text = "Text classification can be used for a wide variety of tasks" },
+                new TextData(){ Text = "such as sentiment analysis, topic detection, intent identification etc." },
+            };
+
+            // Load the in-memory samples into an IDataView so the pipeline can be fit on them.
+            var dataview = mlContext.Data.LoadFromEnumerable(samples);
+
+            // A pipeline for converting text into numeric features: reads the "Text" column, writes "Features".
+            // The following call to 'FeaturizeText' instantiates 'TextFeaturizingEstimator' with the options below.
+            // The length of the output feature vector depends on these settings (and on the data used in Fit).
+            var options = new TextFeaturizingEstimator.Options()
+            {
+                // Also output the tokenized words, into a column named "OutputTokens".
+                OutputTokensColumnName = "OutputTokens",
+                CaseMode = TextNormalizingEstimator.CaseMode.Lower,
+                // Use ML.NET's built-in stop word remover, configured for English.
+                StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options() { Language = TextFeaturizingEstimator.Language.English },
+                WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 2, UseAllLengths = true },
+                CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths= false },
+            };
+            var textPipeline = mlContext.Transforms.Text.FeaturizeText("Features", options, "Text");
+
+            // Fit the pipeline to the sample data to obtain a transformer.
+            var textTransformer = textPipeline.Fit(dataview);
+
+            // Create the prediction engine that maps a single TextData row to a TransformedTextData row.
+            var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);
+
+            // Convert the text of the first sample into numeric features.
+            var prediction = predictionEngine.Predict(samples[0]);
+
+            // Print the length of the feature vector.
+            Console.WriteLine($"Number of Features: {prediction.Features.Length}");
+
+            // Print the first 10 feature values (assumes at least 10 entries), then the comma-joined tokens.
+            Console.Write("Features: ");
+            for (int i = 0; i < 10; i++)
+                Console.Write($"{prediction.Features[i]:F4}  ");
+
+            Console.WriteLine($"\nTokens: {string.Join(",", prediction.OutputTokens)}");
+
+            //  Expected output (only 10 values are printed; the trailing "..." just marks the omitted rest):
+            //   Number of Features: 282
+            //   Features: 0.0941  0.0941  0.0941  0.0941  0.0941  0.0941  0.0941  0.0941  0.0941  0.1881 ...
+            //   Tokens: ml.net's,featurizetext,api,uses,composition,basic,transforms,convert,text,numeric,features.
+        }
+
+        public class TextData // Input row type for the sample.
+        {
+            public string Text { get; set; } // Raw sentence to featurize.
+        }
+
+        public class TransformedTextData : TextData // Output row type: the input plus features and tokens.
+        {
+            public float[] Features { get; set; } // Numeric feature vector read back in Example().
+            public string[] OutputTokens { get; set; } // Tokens from the "OutputTokens" column set in the options.
+        }
+    }
+}
diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
@@ -22,6 +22,13 @@ public static class TextCatalog
         /// <param name="catalog">The text-related transform's catalog.</param>
         /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
         /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
+        /// <example>
+        /// <format type="text/markdown">
+        /// <![CDATA[
+        /// [!code-csharp[FeaturizeText](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/FeaturizeText.cs)]
+        /// ]]>
+        /// </format>
+        /// </example>
         public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.TextTransforms catalog,
             string outputColumnName,
             string inputColumnName = null)
@@ -38,7 +45,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[
-        /// [!code-csharp[FeaturizeText](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs)]
+        /// [!code-csharp[FeaturizeText](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/FeaturizeTextWithOptions.cs)]
         /// ]]>
         /// </format>
         /// </example>