Created samples for TokenizeIntoWords and RemoveStopWords APIs. (#3156)

zeahmed · web-flow · commit 950f133c2f8f · 2019-04-02T15:06:18.000-07:00
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs
@@ -13,7 +13,7 @@ public static void Example()
             // as well as the source of randomness.
             var mlContext = new MLContext();
 
-            // Create an empty data sample list. The 'ApplyWordEmbedding' does not require training data as
+            // Create an empty list as the dataset. The 'ApplyWordEmbedding' does not require training data as
             // the estimator ('WordEmbeddingEstimator') created by 'ApplyWordEmbedding' API is not a trainable estimator.
             // The empty list is only needed to pass input schema to the pipeline.
             var emptySamples = new List<TextData>();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs
@@ -12,7 +12,7 @@ public static void Example()
             // as well as the source of randomness.
             var mlContext = new MLContext();
 
-            // Create an empty data sample list. The 'ApplyWordEmbedding' does not require training data as
+            // Create an empty list as the dataset. The 'ApplyWordEmbedding' does not require training data as
             // the estimator ('WordEmbeddingEstimator') created by 'ApplyWordEmbedding' API is not a trainable estimator.
             // The empty list is only needed to pass input schema to the pipeline.
             var emptySamples = new List<TextData>();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs
@@ -12,7 +12,7 @@ public static void Example()
             // as well as the source of randomness.
             var mlContext = new MLContext();
 
-            // Create an empty data sample list. The 'NormalizeText' API does not require training data as
+            // Create an empty list as the dataset. The 'NormalizeText' API does not require training data as
             // the estimator ('TextNormalizingEstimator') created by 'NormalizeText' API is not a trainable estimator.
             // The empty list is only needed to pass input schema to the pipeline.
             var emptySamples = new List<TextData>();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs
@@ -0,0 +1,60 @@
+﻿using System;
+using System.Collections.Generic;
+using Microsoft.ML.Transforms.Text;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class RemoveDefaultStopWords
+    {
+        public static void Example()
+        {
+            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, 
+            // as well as the source of randomness.
+            var mlContext = new MLContext();
+
+            // Create an empty list as the dataset. The 'RemoveDefaultStopWords' API does not require training data as
+            // the estimator ('StopWordsRemovingEstimator') created by 'RemoveDefaultStopWords' API is not a trainable estimator.
+            // The empty list is only needed to pass input schema to the pipeline.
+            var emptySamples = new List<TextData>();
+
+            // Convert the empty sample list to an IDataView.
+            var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples);
+
+            // A pipeline for removing stop words from input text/string.
+            // The pipeline first tokenizes text into words and then removes stop words.
+            // The 'RemoveDefaultStopWords' API ignores casing of the text/string e.g. 'tHe' and 'the' are considered the same stop word.
+            var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text")
+                .Append(mlContext.Transforms.Text.RemoveDefaultStopWords("WordsWithoutStopWords", "Words", language: StopWordsRemovingEstimator.Language.English));
+
+            // Fit to data.
+            var textTransformer = textPipeline.Fit(emptyDataView);
+
+            // Create the prediction engine to remove the stop words from the input text/string.
+            var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);
+
+            // Call the prediction API to remove stop words.
+            var data = new TextData() { Text = "ML.NET's RemoveDefaultStopWords API removes stop words from tHe text/string. It requires the text/string to be tokenized beforehand." };
+            var prediction = predictionEngine.Predict(data);
+
+            // Print the length of the word vector after the stop words are removed.
+            Console.WriteLine($"Number of words: {prediction.WordsWithoutStopWords.Length}");
+
+            // Print the word vector without stop words.
+            Console.WriteLine($"\nWords without stop words: {string.Join(",", prediction.WordsWithoutStopWords)}");
+
+            //  Expected output (the leading '\n' above puts a blank line between the two lines):
+            //   Number of words: 11
+            //   Words without stop words: ML.NET's,RemoveDefaultStopWords,API,removes,stop,words,text/string.,requires,text/string,tokenized,beforehand.
+        }
+
+        public class TextData
+        {
+            public string Text { get; set; }
+        }
+
+        public class TransformedTextData : TextData
+        {
+            public string[] WordsWithoutStopWords { get; set; }
+        }
+    }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs
@@ -0,0 +1,60 @@
+﻿using System;
+using System.Collections.Generic;
+using Microsoft.ML.Transforms.Text;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class RemoveStopWords
+    {
+        public static void Example()
+        {
+            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, 
+            // as well as the source of randomness.
+            var mlContext = new MLContext();
+
+            // Create an empty list as the dataset. The 'RemoveStopWords' API does not require training data as
+            // the estimator ('CustomStopWordsRemovingEstimator') created by 'RemoveStopWords' API is not a trainable estimator.
+            // The empty list is only needed to pass input schema to the pipeline.
+            var emptySamples = new List<TextData>();
+
+            // Convert the empty sample list to an IDataView.
+            var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples);
+
+            // A pipeline for removing stop words from input text/string.
+            // The pipeline first tokenizes text into words and then removes stop words.
+            // The 'RemoveStopWords' API ignores casing of the text/string e.g. 'tHe' and 'the' are considered the same stop word.
+            var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text")
+                .Append(mlContext.Transforms.Text.RemoveStopWords("WordsWithoutStopWords", "Words", stopwords: new[] { "a", "the", "from", "by" }));
+
+            // Fit to data.
+            var textTransformer = textPipeline.Fit(emptyDataView);
+
+            // Create the prediction engine to remove the stop words from the input text/string.
+            var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);
+
+            // Call the prediction API to remove stop words.
+            var data = new TextData() { Text = "ML.NET's RemoveStopWords API removes stop words from tHe text/string using a list of stop words provided by the user." };
+            var prediction = predictionEngine.Predict(data);
+
+            // Print the length of the word vector after the stop words are removed.
+            Console.WriteLine($"Number of words: {prediction.WordsWithoutStopWords.Length}");
+
+            // Print the word vector without stop words.
+            Console.WriteLine($"\nWords without stop words: {string.Join(",", prediction.WordsWithoutStopWords)}");
+
+            //  Expected output (the leading '\n' above puts a blank line between the two lines):
+            //   Number of words: 14
+            //   Words without stop words: ML.NET's,RemoveStopWords,API,removes,stop,words,text/string,using,list,of,stop,words,provided,user.
+        }
+
+        public class TextData
+        {
+            public string Text { get; set; }
+        }
+
+        public class TransformedTextData : TextData
+        {
+            public string[] WordsWithoutStopWords { get; set; }
+        }
+    }
+}
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharactersAsKeys.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharactersAsKeys.cs
@@ -12,7 +12,7 @@ public static void Example()
             // as well as the source of randomness.
             var mlContext = new MLContext();
 
-            // Create an empty data sample list. The 'TokenizeIntoCharactersAsKeys' does not require training data as
+            // Create an empty list as the dataset. The 'TokenizeIntoCharactersAsKeys' does not require training data as
             // the estimator ('TokenizingByCharactersEstimator') created by 'TokenizeIntoCharactersAsKeys' API is not a trainable estimator.
             // The empty list is only needed to pass input schema to the pipeline.
             var emptySamples = new List<TextData>();
diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoWords.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoWords.cs
@@ -0,0 +1,59 @@
+﻿using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace Microsoft.ML.Samples.Dynamic
+{
+    public static class TokenizeIntoWords
+    {
+        public static void Example()
+        {
+            // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, 
+            // as well as the source of randomness.
+            var mlContext = new MLContext();
+
+            // Create an empty list as the dataset. The 'TokenizeIntoWords' API does not require training data as
+            // the estimator ('WordTokenizingEstimator') created by 'TokenizeIntoWords' API is not a trainable estimator.
+            // The empty list is only needed to pass input schema to the pipeline.
+            var emptySamples = new List<TextData>();
+
+            // Convert the empty sample list to an IDataView.
+            var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples);
+
+            // A pipeline for converting text into a vector of words.
+            // The following call to 'TokenizeIntoWords' tokenizes text/string into words using space as a separator.
+            // Space is also the default value of the 'separators' argument when it is not specified.
+            var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text", separators: new[] { ' ' });
+
+            // Fit to data.
+            var textTransformer = textPipeline.Fit(emptyDataView);
+
+            // Create the prediction engine to get the word vector from the input text/string.
+            var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);
+
+            // Call the prediction API to convert the text into words.
+            var data = new TextData() { Text = "ML.NET's TokenizeIntoWords API splits text/string into words using the list of characters provided as separators." };
+            var prediction = predictionEngine.Predict(data);
+
+            // Print the length of the word vector.
+            Console.WriteLine($"Number of words: {prediction.Words.Length}");
+
+            // Print the word vector.
+            Console.WriteLine($"\nWords: {string.Join(",", prediction.Words)}");
+
+            //  Expected output (the leading '\n' above puts a blank line between the two lines):
+            //   Number of words: 15
+            //   Words: ML.NET's,TokenizeIntoWords,API,splits,text/string,into,words,using,the,list,of,characters,provided,as,separators.
+        }
+
+        public class TextData
+        {
+            public string Text { get; set; }
+        }
+
+        public class TransformedTextData : TextData
+        {
+            public string[] Words { get; set; }
+        }
+    }
+}
diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs
@@ -186,6 +186,13 @@ internal static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog
         /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
         /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
         /// <param name="separators">The separators to use (uses space character by default).</param>
+        /// <example>
+        /// <format type="text/markdown">
+        /// <![CDATA[
+        /// [!code-csharp[TokenizeIntoWords](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoWords.cs)]
+        /// ]]>
+        /// </format>
+        /// </example>
         public static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog.TextTransforms catalog,
             string outputColumnName,
             string inputColumnName = null,
@@ -254,8 +261,9 @@ internal static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.Te
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[
-        ///  [!code-csharp[FastTree](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs)]
-        /// ]]></format>
+        /// [!code-csharp[RemoveDefaultStopWords](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs)]
+        /// ]]>
+        /// </format>
         /// </example>
         public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsCatalog.TextTransforms catalog,
             string outputColumnName,
@@ -274,8 +282,9 @@ public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsC
         /// <example>
         /// <format type="text/markdown">
         /// <![CDATA[
-        ///  [!code-csharp[FastTree](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs)]
-        /// ]]></format>
+        /// [!code-csharp[RemoveStopWords](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs)]
+        /// ]]>
+        /// </format>
         /// </example>
         public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCatalog.TextTransforms catalog,
             string outputColumnName,