From ea847a9fcc349b28448235946bf83193aff3051d Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Thu, 18 Apr 2019 12:57:05 -0700 Subject: [PATCH 1/7] Update documentation for stopwords --- .../Text/StopWordsRemovingTransformer.cs | 51 ++++++++++++++----- .../Text/TextCatalog.cs | 24 +++++---- 2 files changed, 52 insertions(+), 23 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs index e2c6cac925..707842cb68 100644 --- a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs @@ -61,10 +61,7 @@ public ITransformer CreateComponent(IHostEnvironment env, IDataView input, OneTo } /// - /// A Stopword remover transform based on language-specific lists of stop words (most common words) - /// from Office Named Entity Recognition project. - /// The transform is usually applied after tokenizing text, so it compares individual tokens - /// (case-insensitive comparison) to the stopwords. + /// resulting from fitting an . /// public sealed class StopWordsRemovingTransformer : OneToOneTransformerBase { @@ -483,10 +480,25 @@ private protected override Func GetDependenciesCore(Func a } /// - /// Stopword remover removes language-specific list of stop words (most common words) - /// This is usually applied after tokenizing text, so it compares individual tokens - /// (case-insensitive comparison) to the stopwords. + /// for the . /// + /// + /// creates a new column, named as specified in the output column name parameters, and + /// fills it with vector of strings similar to vector of strings in input column but removing all, predefined for certain language, strings from it. + /// All strings comparison made by casting predefined strings and strings from input column to lower case using casing rules of invariant culture. + /// See the See Also section for links to examples of the usage. 
+ /// ]]> + /// + /// + /// public sealed class StopWordsRemovingEstimator : TrivialEstimator { /// @@ -627,9 +639,7 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema) } /// - /// Custom stopword remover removes specified list of stop words. - /// This is usually applied after tokenizing text, so it compares individual tokens - /// (case-insensitive comparison) to the stopwords. + /// resulting from fitting an . /// public sealed class CustomStopWordsRemovingTransformer : OneToOneTransformerBase { @@ -1076,10 +1086,25 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func - /// Custom stopword remover removes specified list of stop words. - /// This is usually applied after tokenizing text, so it compares individual tokens - /// (case-insensitive comparison) to the stopwords. + /// for the . /// + /// + /// creates a new column, named as specified in the output column name parameters, and + /// fills it with vector of strings similar to vector of strings in input column but removing all specified strings from it. + /// All strings comparison made by casting specified strings and strings from input column to lower case using casing rules of invariant culture. + /// See the See Also section for links to examples of the usage. + /// ]]> + /// + /// + /// public sealed class CustomStopWordsRemovingEstimator : TrivialEstimator { /// diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 3d6402ff1d..d374be15d1 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -251,12 +251,14 @@ internal static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.Te => new NgramExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns); /// - /// Removes stop words from incoming token streams in - /// and outputs the token streams without stopwords as . 
+ /// Create a , which copies the data from the column specified in + /// to a new column: and removes predifined set of strings specific for from it. /// - /// The text-related transform's catalog. - /// The column containing output text. Null means is replaced. - /// The column containing text to remove stop words on. + /// The transform's catalog. + /// Name of the column resulting from the transformation of . + /// This column's data type will be unknown size vector of strings. + /// Name of the column to copy the data from. + /// This estimator operates over vector of strings. /// Langauge of the input text column . /// /// @@ -272,12 +274,14 @@ public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsC => new StopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, language); /// - /// Removes stop words from incoming token streams in - /// and outputs the token streams without stopwords as . + /// Create a , which copies the data from the column specified in + /// to a new column: and removes specified strings in from it. /// - /// The text-related transform's catalog. - /// The column containing output text. Null means is replaced. - /// The column containing text to remove stop words on. + /// The transform's catalog. + /// Name of the column resulting from the transformation of . + /// This column's data type will be unknown size vector of strings. + /// Name of the column to copy the data from. + /// This estimator operates over vector of strings. /// Array of words to remove. 
/// /// From 7e1016c5814a4c2a4e127650eabdcb1139b7c4c7 Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Thu, 18 Apr 2019 14:42:55 -0700 Subject: [PATCH 2/7] string to text --- .../Text/StopWordsRemovingTransformer.cs | 16 ++++++++-------- src/Microsoft.ML.Transforms/Text/TextCatalog.cs | 12 ++++++------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs index 707842cb68..8ae38c7086 100644 --- a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs @@ -61,7 +61,7 @@ public ITransformer CreateComponent(IHostEnvironment env, IDataView input, OneTo } /// - /// resulting from fitting an . + /// resulting from fitting a . /// public sealed class StopWordsRemovingTransformer : OneToOneTransformerBase { @@ -488,12 +488,12 @@ private protected override Func GetDependenciesCore(Func a /// | | | /// | -- | -- | /// | Does this estimator need to look at the data to train its parameters? | No | - /// | Input column data type | Vector of TextDataViewType | - /// | Output column data type | Vector of unknown size of TextDataViewType | + /// | Input column data type | Vector of [Text] | + /// | Output column data type | Vector of unknown size of [Text] | /// /// The resulting creates a new column, named as specified in the output column name parameters, and /// fills it with vector of strings similar to vector of strings in input column but removing all, predefined for certain language, strings from it. - /// All strings comparison made by casting predefined strings and strings from input column to lower case using casing rules of invariant culture. + /// All text comparison made by casting predefined text and text from input column to lower case using casing rules of invariant culture. /// See the See Also section for links to examples of the usage. 
/// ]]> /// @@ -639,7 +639,7 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema) } /// - /// resulting from fitting an . + /// resulting from fitting a . /// public sealed class CustomStopWordsRemovingTransformer : OneToOneTransformerBase { @@ -1094,12 +1094,12 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func | + /// | Output column data type | Vector of unknown size of [Text] | /// /// The resulting creates a new column, named as specified in the output column name parameters, and /// fills it with vector of strings similar to vector of strings in input column but removing all specified strings from it. - /// All strings comparison made by casting specified strings and strings from input column to lower case using casing rules of invariant culture. + /// All text comparison made by casting specified text and text from input column to lower case using casing rules of invariant culture. /// See the See Also section for links to examples of the usage. /// ]]> /// diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index d374be15d1..7abd6c15e6 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -252,13 +252,13 @@ internal static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.Te /// /// Create a , which copies the data from the column specified in - /// to a new column: and removes predifined set of strings specific for from it. + /// to a new column: and removes predefined set of text specific for from it. /// /// The transform's catalog. /// Name of the column resulting from the transformation of . - /// This column's data type will be unknown size vector of strings. + /// This column's data type will be unknown size vector of text. /// Name of the column to copy the data from. - /// This estimator operates over vector of strings. + /// This estimator operates over vector of text. 
/// Langauge of the input text column . /// /// @@ -275,13 +275,13 @@ public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsC /// /// Create a , which copies the data from the column specified in - /// to a new column: and removes specified strings in from it. + /// to a new column: and removes text specified in from it. /// /// The transform's catalog. /// Name of the column resulting from the transformation of . - /// This column's data type will be unknown size vector of strings. + /// This column's data type will be unknown size vector of text. /// Name of the column to copy the data from. - /// This estimator operates over vector of strings. + /// This estimator operates over vector of text. /// Array of words to remove. /// /// From 818ec4ae5a2d2254efb5148402a4528886e764b1 Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Thu, 18 Apr 2019 17:15:22 -0700 Subject: [PATCH 3/7] address comments --- .../Text/StopWordsRemovingTransformer.cs | 6 +++--- src/Microsoft.ML.Transforms/Text/TextCatalog.cs | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs index 8ae38c7086..09b4f21cfc 100644 --- a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs @@ -489,10 +489,10 @@ private protected override Func GetDependenciesCore(Func a /// | -- | -- | /// | Does this estimator need to look at the data to train its parameters? 
| No | /// | Input column data type | Vector of [Text] | - /// | Output column data type | Vector of unknown size of [Text] | + /// | Output column data type | Unknown-sized vector of [Text] | /// /// The resulting creates a new column, named as specified in the output column name parameters, and - /// fills it with vector of strings similar to vector of strings in input column but removing all, predefined for certain language, strings from it. + /// fills it with vector of texts similar to vector of texts in input column but removing all, predefined for certain language, texts from it. /// All text comparison made by casting predefined text and text from input column to lower case using casing rules of invariant culture. /// See the See Also section for links to examples of the usage. /// ]]> @@ -1098,7 +1098,7 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func | /// /// The resulting creates a new column, named as specified in the output column name parameters, and - /// fills it with vector of strings similar to vector of strings in input column but removing all specified strings from it. + /// fills it with vector of texts similar to vector of texts in input column but removing all specified texts from it. /// All text comparison made by casting specified text and text from input column to lower case using casing rules of invariant culture. /// See the See Also section for links to examples of the usage. /// ]]> diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 7abd6c15e6..341ff92f92 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -256,7 +256,7 @@ internal static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.Te /// /// The transform's catalog. /// Name of the column resulting from the transformation of . - /// This column's data type will be unknown size vector of text. 
+ /// This column's data type will be unknown-size vector of text. /// Name of the column to copy the data from. /// This estimator operates over vector of text. /// Langauge of the input text column . @@ -279,7 +279,7 @@ public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsC /// /// The transform's catalog. /// Name of the column resulting from the transformation of . - /// This column's data type will be unknown size vector of text. + /// This column's data type will be unknown-size vector of text. /// Name of the column to copy the data from. /// This estimator operates over vector of text. /// Array of words to remove. From a8049d9f73c13ae93c8959bc2ef48d4a2383bff6 Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Fri, 19 Apr 2019 10:01:00 -0700 Subject: [PATCH 4/7] address comments --- .../Text/StopWordsRemovingTransformer.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs index 09b4f21cfc..b048d74e71 100644 --- a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs @@ -492,7 +492,7 @@ private protected override Func GetDependenciesCore(Func a /// | Output column data type | Unknown-sized vector of [Text] | /// /// The resulting creates a new column, named as specified in the output column name parameters, and - /// fills it with vector of texts similar to vector of texts in input column but removing all, predefined for certain language, texts from it. + /// fills it with vector of words similar to vector of words in input column but removing all, predefined for certain language, texts from it. /// All text comparison made by casting predefined text and text from input column to lower case using casing rules of invariant culture. /// See the See Also section for links to examples of the usage. 
/// ]]> @@ -1095,11 +1095,11 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func | - /// | Output column data type | Vector of unknown size of [Text] | + /// | Output column data type | Unknown-sized vector of [Text] | /// /// The resulting creates a new column, named as specified in the output column name parameters, and - /// fills it with vector of texts similar to vector of texts in input column but removing all specified texts from it. - /// All text comparison made by casting specified text and text from input column to lower case using casing rules of invariant culture. + /// fills it with vector of texts similar to vector of words in input column but removing all provided by user words from it. + /// All text comparison made by casting provided words and words from input column to lower case using casing rules of invariant culture. /// See the See Also section for links to examples of the usage. /// ]]> /// From f3f9003badad2504ddf23a9a42d1b521c76e016b Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Fri, 19 Apr 2019 12:44:20 -0700 Subject: [PATCH 5/7] ReadOnly to TextDataViewType --- .../Text/StopWordsRemovingTransformer.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs index b048d74e71..1aa3070f74 100644 --- a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs @@ -488,8 +488,8 @@ private protected override Func GetDependenciesCore(Func a /// | | | /// | -- | -- | /// | Does this estimator need to look at the data to train its parameters? 
| No | - /// | Input column data type | Vector of [Text] | - /// | Output column data type | Unknown-sized vector of [Text] | + /// | Input column data type | Vector of [Text]() | + /// | Output column data type | Unknown-sized vector of [Text]() | /// /// The resulting creates a new column, named as specified in the output column name parameters, and /// fills it with vector of words similar to vector of words in input column but removing all, predefined for certain language, texts from it. @@ -1094,8 +1094,8 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func | - /// | Output column data type | Unknown-sized vector of [Text] | + /// | Input column data type | Vector of [Text]() | + /// | Output column data type | Unknown-sized vector of [Text]() | /// /// The resulting creates a new column, named as specified in the output column name parameters, and /// fills it with vector of texts similar to vector of words in input column but removing all provided by user words from it. From 5c5fc26049fd6029e1df666c91b744d6d6c911d7 Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Fri, 19 Apr 2019 14:14:53 -0700 Subject: [PATCH 6/7] address comments --- .../Text/StopWordsRemovingTransformer.cs | 10 +++++----- src/Microsoft.ML.Transforms/Text/TextCatalog.cs | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs index 1aa3070f74..c6eec1a2e0 100644 --- a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs @@ -489,10 +489,10 @@ private protected override Func GetDependenciesCore(Func a /// | -- | -- | /// | Does this estimator need to look at the data to train its parameters? 
| No | /// | Input column data type | Vector of [Text]() | - /// | Output column data type | Unknown-sized vector of [Text]() | + /// | Output column data type | Variable-sized vector of [Text]() | /// - /// The resulting creates a new column, named as specified in the output column name parameters, and - /// fills it with vector of words similar to vector of words in input column but removing all, predefined for certain language, texts from it. + /// The resulting creates a new column, named as specified in the output column name parameter, + /// and fills it with a vector of words containing all of the words in the input column **except the predefined list of stopwords for the specified language. /// All text comparison made by casting predefined text and text from input column to lower case using casing rules of invariant culture. /// See the See Also section for links to examples of the usage. /// ]]> @@ -1097,8 +1097,8 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func) | /// | Output column data type | Unknown-sized vector of [Text]() | /// - /// The resulting creates a new column, named as specified in the output column name parameters, and - /// fills it with vector of texts similar to vector of words in input column but removing all provided by user words from it. + /// The resulting creates a new column, named as specified by the output column name parameter, and + /// fills it with a vector of words containing all of the words in the input column except those given by the stopwords parameter. /// All text comparison made by casting provided words and words from input column to lower case using casing rules of invariant culture. /// See the See Also section for links to examples of the usage. 
/// ]]> diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 341ff92f92..d4f9bbefb4 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -256,7 +256,7 @@ internal static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.Te /// /// The transform's catalog. /// Name of the column resulting from the transformation of . - /// This column's data type will be unknown-size vector of text. + /// This column's data type will be variable-sized vector of text. /// Name of the column to copy the data from. /// This estimator operates over vector of text. /// Langauge of the input text column . @@ -279,7 +279,7 @@ public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsC /// /// The transform's catalog. /// Name of the column resulting from the transformation of . - /// This column's data type will be unknown-size vector of text. + /// This column's data type will be variable-sized vector of text. /// Name of the column to copy the data from. /// This estimator operates over vector of text. /// Array of words to remove. From abff8d4a7681c763fc487b131da4d7aee411004b Mon Sep 17 00:00:00 2001 From: Ivan Matantsev Date: Fri, 19 Apr 2019 14:32:11 -0700 Subject: [PATCH 7/7] small update for types --- .../Text/StopWordsRemovingTransformer.cs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs index c6eec1a2e0..8269ff18c8 100644 --- a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs @@ -488,8 +488,8 @@ private protected override Func GetDependenciesCore(Func a /// | | | /// | -- | -- | /// | Does this estimator need to look at the data to train its parameters? 
| No | - /// | Input column data type | Vector of [Text]() | - /// | Output column data type | Variable-sized vector of [Text]() | + /// | Input column data type | Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) | + /// | Output column data type | Variable-sized vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) | /// /// The resulting creates a new column, named as specified in the output column name parameter, /// and fills it with a vector of words containing all of the words in the input column **except the predefined list of stopwords for the specified language. @@ -1094,8 +1094,8 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func) | - /// | Output column data type | Unknown-sized vector of [Text]() | + /// | Input column data type | Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) | + /// | Output column data type | Variable-sized vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) | /// /// The resulting creates a new column, named as specified by the output column name parameter, and /// fills it with a vector of words containing all of the words in the input column except those given by the stopwords parameter.