Skip to content

Commit 145bf74

Browse files
committed
Polish word-level tokenizers
1 parent 305b2a6 commit 145bf74

File tree

15 files changed

+32
-31
lines changed

15 files changed

+32
-31
lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,15 @@ public static void Example()
3030
// making use of default settings.
3131
string defaultColumnName = "DefaultKeys";
3232
// REVIEW create through the catalog extension
33-
var default_pipeline = ml.Transforms.Text.TokenizeWords("Review")
33+
var default_pipeline = ml.Transforms.Text.ProduceWordTokens("Review")
3434
.Append(ml.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review"));
3535

3636
// Another pipeline, that customizes the advanced settings of the ValueToKeyMappingEstimator.
3737
// We can change the maxNumTerm to limit how many keys will get generated out of the set of words,
3838
// and condition the order in which they get evaluated by changing sort from the default Occurrence (order in which they get encountered)
3939
// to value/alphabetically.
4040
string customizedColumnName = "CustomizedKeys";
41-
var customized_pipeline = ml.Transforms.Text.TokenizeWords("Review")
41+
var customized_pipeline = ml.Transforms.Text.ProduceWordTokens("Review")
4242
.Append(ml.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maxNumKeys: 10, sort: ValueToKeyMappingEstimator.SortOrder.Value));
4343

4444
// The transformed data.

docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ public static void Example()
2525

2626
// Let's take SentimentText column and break it into vector of words.
2727
string originalTextColumnName = "Words";
28-
var words = ml.Transforms.Text.TokenizeWords("SentimentText", originalTextColumnName);
28+
var words = ml.Transforms.Text.ProduceWordTokens("SentimentText", originalTextColumnName);
2929

3030
// Default pipeline will apply default stop word remover which is based on a predefined set of words for certain languages.
3131
var defaultPipeline = words.Append(ml.Transforms.Text.RemoveDefaultStopWords(originalTextColumnName, "DefaultRemover"));

docs/samples/Microsoft.ML.Samples/Dynamic/TensorFlow/TextClassification.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ public static void Example()
6868
j.Features = features;
6969
};
7070

71-
var engine = mlContext.Transforms.Text.TokenizeWords("TokenizedWords", "Sentiment_Text")
71+
var engine = mlContext.Transforms.Text.ProduceWordTokens("TokenizedWords", "Sentiment_Text")
7272
.Append(mlContext.Transforms.Conversion.ValueMap(lookupMap, "Words", "Ids", new ColumnOptions[] { ("VariableLenghtFeatures", "TokenizedWords") }))
7373
.Append(mlContext.Transforms.CustomMapping(ResizeFeaturesAction, "Resize"))
7474
.Append(tensorFlowModel.ScoreTensorFlowModel(new[] { "Prediction/Softmax" }, new[] { "Features" }))

docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ public static void Example()
2626

2727
// Pipeline which goes through SentimentText and normalizes it, tokenizes it by words, and removes default stopwords.
2828
var wordsPipeline = ml.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
29-
.Append(ml.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
29+
.Append(ml.Transforms.Text.ProduceWordTokens("Words", "NormalizedText"))
3030
.Append(ml.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
3131

3232
var wordsDataview = wordsPipeline.Fit(trainData).Transform(trainData);

src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
5555
/// </summary>
5656
/// <param name="input">The column to apply to.</param>
5757
/// <param name="separators">The separators to use (uses space character by default).</param>
58-
public static VarVector<string> TokenizeText(this Scalar<string> input, char[] separators = null) => new OutPipelineColumn(input, separators);
58+
public static VarVector<string> ProduceWordTokens(this Scalar<string> input, char[] separators = null) => new OutPipelineColumn(input, separators);
5959
}
6060

6161
/// <summary>

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

+2-13
Original file line numberDiff line numberDiff line change
@@ -160,29 +160,18 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T
160160
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
161161
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
162162
/// <param name="separators">The separators to use (uses space character by default).</param>
163-
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
163+
public static WordTokenizingEstimator ProduceWordTokens(this TransformsCatalog.TextTransforms catalog,
164164
string outputColumnName,
165165
string inputColumnName = null,
166166
char[] separators = null)
167167
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, separators);
168168

169-
/// <summary>
170-
/// Tokenizes incoming text in input columns and outputs the tokens using <paramref name="separators"/> as separators.
171-
/// </summary>
172-
/// <param name="catalog">The text-related transform's catalog.</param>
173-
/// <param name="columns">Pairs of columns to run the tokenization on.</param>
174-
/// <param name="separators">The separators to use (uses space character by default).</param>
175-
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
176-
(string outputColumnName, string inputColumnName)[] columns,
177-
char[] separators = null)
178-
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, separators);
179-
180169
/// <summary>
181170
/// Tokenizes incoming text in input columns, using per-column configurations, and outputs the tokens.
182171
/// </summary>
183172
/// <param name="catalog">The text-related transform's catalog.</param>
184173
/// <param name="columns">Pairs of columns to run the tokenization on.</param>
185-
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
174+
public static WordTokenizingEstimator ProduceWordTokens(this TransformsCatalog.TextTransforms catalog,
186175
params WordTokenizingEstimator.ColumnOptions[] columns)
187176
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns);
188177

src/Microsoft.ML.Transforms/Text/WordTokenizing.cs

+12
Original file line numberDiff line numberDiff line change
@@ -441,9 +441,21 @@ internal WordTokenizingEstimator(IHostEnvironment env, params ColumnOptions[] co
441441
}
442442
public sealed class ColumnOptions
443443
{
444+
/// <summary>
445+
/// Output column name that will be used to store the tokenization result of <see cref="InputColumnName"/> column.
446+
/// </summary>
444447
public readonly string Name;
448+
/// <summary>
449+
/// Input column name that will be tokenized into words.
450+
/// </summary>
445451
public readonly string InputColumnName;
452+
/// <summary>
453+
/// Separator list used to tokenize the input string. If not specified, space will be used.
454+
/// </summary>
446455
public IReadOnlyList<char> Separators => SeparatorsArray;
456+
/// <summary>
457+
/// State of <see cref="Separators"/>. Since <see langword="char"/>[] is mutable, it's not safe to directly expose this field to users.
458+
/// </summary>
447459
internal readonly char[] SeparatorsArray;
448460

449461
/// <summary>

test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs

+4-4
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,7 @@ public void Tokenize()
519519
var est = data.MakeNewEstimator()
520520
.Append(r => (
521521
r.label,
522-
tokens: r.text.TokenizeText(),
522+
tokens: r.text.ProduceWordTokens(),
523523
chars: r.text.ProduceCharacterTokens()));
524524

525525
var tdata = est.Fit(data).Transform(data);
@@ -547,7 +547,7 @@ public void NormalizeTextAndRemoveStopWords()
547547
.Append(r => (
548548
r.label,
549549
normalized_text: r.text.NormalizeText(),
550-
words_without_stopwords: r.text.TokenizeText().RemoveStopwords()));
550+
words_without_stopwords: r.text.ProduceWordTokens().RemoveStopwords()));
551551

552552
var tdata = est.Fit(data).Transform(data);
553553
var schema = tdata.AsDynamic.Schema;
@@ -604,8 +604,8 @@ public void Ngrams()
604604
var est = data.MakeNewEstimator()
605605
.Append(r => (
606606
r.label,
607-
ngrams: r.text.TokenizeText().ToKey().ToNgrams(),
608-
ngramshash: r.text.TokenizeText().ToKey().ToNgramsHash()));
607+
ngrams: r.text.ProduceWordTokens().ToKey().ToNgrams(),
608+
ngramshash: r.text.ProduceWordTokens().ToKey().ToNgramsHash()));
609609

610610
var tdata = est.Fit(data).Transform(data);
611611
var schema = tdata.AsDynamic.Schema;

test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,7 @@ private void TextFeaturizationOn(string dataPath)
472472
// NLP pipeline 4: word embeddings.
473473
// PretrainedModelKind.Sswe is used here for performance of the test. In a real
474474
// scenario, it is best to use a different model for more accuracy.
475-
Embeddings: r.Message.NormalizeText().TokenizeText().WordEmbeddings(WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
475+
Embeddings: r.Message.NormalizeText().ProduceWordTokens().WordEmbeddings(WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
476476
));
477477

478478
// Let's train our pipeline, and then apply it to the same data.

test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,7 @@ private void TextFeaturizationOn(string dataPath)
312312
// NLP pipeline 4: word embeddings.
313313
// PretrainedModelKind.Sswe is used here for performance of the test. In a real
314314
// scenario, it is best to use a different model for more accuracy.
315-
.Append(mlContext.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage"))
315+
.Append(mlContext.Transforms.Text.ProduceWordTokens("TokenizedMessage", "NormalizedMessage"))
316316
.Append(mlContext.Transforms.Text.ApplyWordEmbedding("Embeddings", "TokenizedMessage",
317317
WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding));
318318

test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -984,7 +984,7 @@ public void TensorFlowSentimentClassificationTest()
984984
// The first pipeline 'dataPipe' tokenizes the string into words and maps each word to an integer which is an index in the dictionary.
985985
// Then this integer vector is retrieved from the pipeline and resized to fixed length.
986986
// The second pipeline 'tfEnginePipe' takes the resized integer vector and passes it to TensorFlow and gets the classification scores.
987-
var estimator = mlContext.Transforms.Text.TokenizeWords("TokenizedWords", "Sentiment_Text")
987+
var estimator = mlContext.Transforms.Text.ProduceWordTokens("TokenizedWords", "Sentiment_Text")
988988
.Append(mlContext.Transforms.Conversion.ValueMap(lookupMap, "Words", "Ids", new ColumnOptions[] { ("Features", "TokenizedWords") }));
989989
var dataPipe = estimator.Fit(dataView)
990990
.CreatePredictionEngine<TensorFlowSentiment, TensorFlowSentiment>(mlContext);

test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ public void CategoricalHashStatic()
7676
row.ScalarString,
7777
row.VectorString,
7878
// Create a VarVector column
79-
VarVectorString: row.ScalarString.TokenizeText())).
79+
VarVectorString: row.ScalarString.ProduceWordTokens())).
8080
Append(row => (
8181
A: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Ind),
8282
B: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Ind),

test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ public void TextNormalizationAndStopwordRemoverWorkout()
143143
text: ctx.LoadFloat(1)), hasHeader: true)
144144
.Load(sentimentDataPath);
145145
var est = ML.Transforms.Text.NormalizeText("text")
146-
.Append(ML.Transforms.Text.TokenizeWords("words", "text"))
146+
.Append(ML.Transforms.Text.ProduceWordTokens("words", "text"))
147147
.Append(ML.Transforms.Text.RemoveDefaultStopWords("NoDefaultStopwords", "words"))
148148
.Append(ML.Transforms.Text.RemoveStopWords("NoStopWords", "words", "xbox", "this", "is", "a", "the", "THAT", "bY"));
149149

test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -546,7 +546,7 @@ public void ValueMappingInputIsVectorWorkout()
546546
var keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() };
547547
var values = new List<int>() { 1, 2, 3, 4 };
548548

549-
var est = ML.Transforms.Text.TokenizeWords("TokenizeB", "B")
549+
var est = ML.Transforms.Text.ProduceWordTokens("TokenizeB", "B")
550550
.Append(ML.Transforms.Conversion.ValueMap(keys, values, new ColumnOptions[] { ("VecB", "TokenizeB") }));
551551
TestEstimatorCore(est, validFitInput: dataView, invalidInput: badDataView);
552552
}

test/Microsoft.ML.Tests/Transformers/WordEmbeddingsTests.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ public void TestWordEmbeddings()
3535
}).Load(GetDataPath(dataPath));
3636

3737
var est = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
38-
.Append(ML.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
38+
.Append(ML.Transforms.Text.ProduceWordTokens("Words", "NormalizedText"))
3939
.Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
4040
var words = est.Fit(data).Transform(data);
4141

@@ -70,7 +70,7 @@ public void TestCustomWordEmbeddings()
7070
}).Load(GetDataPath(dataPath));
7171

7272
var est = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
73-
.Append(ML.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
73+
.Append(ML.Transforms.Text.ProduceWordTokens("Words", "NormalizedText"))
7474
.Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
7575
var words = est.Fit(data).Transform(data);
7676
var pathToCustomModel = DeleteOutputPath("custommodel.txt");

0 commit comments

Comments
 (0)