Skip to content

Commit f41e7f4

Browse files
Ivanidzo4kashauheen
authored andcommitted
Stopwords remover xml documentation (#3413)
* Update documentation for stopwords
1 parent 610ffcb commit f41e7f4

File tree

2 files changed

+52
-23
lines changed

2 files changed

+52
-23
lines changed

src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs

+38-13
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,7 @@ public ITransformer CreateComponent(IHostEnvironment env, IDataView input, OneTo
6161
}
6262

6363
/// <summary>
64-
/// A Stopword remover transform based on language-specific lists of stop words (most common words)
65-
/// from Office Named Entity Recognition project.
66-
/// The transform is usually applied after tokenizing text, so it compares individual tokens
67-
/// (case-insensitive comparison) to the stopwords.
64+
/// <see cref="ITransformer"/> resulting from fitting a <see cref="StopWordsRemovingEstimator"/>.
6865
/// </summary>
6966
public sealed class StopWordsRemovingTransformer : OneToOneTransformerBase
7067
{
@@ -483,10 +480,25 @@ private protected override Func<int, bool> GetDependenciesCore(Func<int, bool> a
483480
}
484481

485482
/// <summary>
486-
/// Stopword remover removes language-specific list of stop words (most common words)
487-
/// This is usually applied after tokenizing text, so it compares individual tokens
488-
/// (case-insensitive comparison) to the stopwords.
483+
/// <see cref="IEstimator{TTransformer}"/> for the <see cref="StopWordsRemovingTransformer"/>.
489484
/// </summary>
485+
/// <remarks>
486+
/// <format type="text/markdown"><![CDATA[
487+
/// ### Estimator Characteristics
488+
/// | | |
489+
/// | -- | -- |
490+
/// | Does this estimator need to look at the data to train its parameters? | No |
491+
/// | Input column data type | Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
492+
/// | Output column data type | Variable-sized vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
493+
///
494+
/// The resulting <xref:Microsoft.ML.Transforms.Text.StopWordsRemovingTransformer> creates a new column, named as specified in the output column name parameter,
495+
/// and fills it with a vector of words containing all of the words in the input column **except the predefined list of stopwords for the specified language.**
496+
/// All text comparisons are made by casting the predefined text and the text from the input column to lower case using the casing rules of the invariant culture.
497+
/// See the See Also section for links to examples of the usage.
498+
/// ]]>
499+
/// </format>
500+
/// </remarks>
501+
/// <seealso cref="TextCatalog.RemoveDefaultStopWords(TransformsCatalog.TextTransforms, string, string, Language)" />
490502
public sealed class StopWordsRemovingEstimator : TrivialEstimator<StopWordsRemovingTransformer>
491503
{
492504
/// <summary>
@@ -627,9 +639,7 @@ public override SchemaShape GetOutputSchema(SchemaShape inputSchema)
627639
}
628640

629641
/// <summary>
630-
/// Custom stopword remover removes specified list of stop words.
631-
/// This is usually applied after tokenizing text, so it compares individual tokens
632-
/// (case-insensitive comparison) to the stopwords.
642+
/// <see cref="ITransformer"/> resulting from fitting a <see cref="CustomStopWordsRemovingEstimator"/>.
633643
/// </summary>
634644
public sealed class CustomStopWordsRemovingTransformer : OneToOneTransformerBase
635645
{
@@ -1076,10 +1086,25 @@ protected override Delegate MakeGetter(DataViewRow input, int iinfo, Func<int, b
10761086
}
10771087

10781088
/// <summary>
1079-
/// Custom stopword remover removes specified list of stop words.
1080-
/// This is usually applied after tokenizing text, so it compares individual tokens
1081-
/// (case-insensitive comparison) to the stopwords.
1089+
/// <see cref="IEstimator{TTransformer}"/> for the <see cref="CustomStopWordsRemovingTransformer"/>.
10821090
/// </summary>
1091+
/// <remarks>
1092+
/// <format type="text/markdown"><![CDATA[
1093+
/// ### Estimator Characteristics
1094+
/// | | |
1095+
/// | -- | -- |
1096+
/// | Does this estimator need to look at the data to train its parameters? | No |
1097+
/// | Input column data type | Vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
1098+
/// | Output column data type | Variable-sized vector of [Text](xref:Microsoft.ML.Data.TextDataViewType) |
1099+
///
1100+
/// The resulting <xref:Microsoft.ML.Transforms.Text.CustomStopWordsRemovingTransformer> creates a new column, named as specified by the output column name parameter, and
1101+
/// fills it with a vector of words containing all of the words in the input column except those given by the stopwords parameter.
1102+
/// All text comparisons are made by casting the provided words and the words from the input column to lower case using the casing rules of the invariant culture.
1103+
/// See the See Also section for links to examples of the usage.
1104+
/// ]]>
1105+
/// </format>
1106+
/// </remarks>
1107+
/// <seealso cref="TextCatalog.RemoveStopWords(TransformsCatalog.TextTransforms, string, string, string[])" />
10831108
public sealed class CustomStopWordsRemovingEstimator : TrivialEstimator<CustomStopWordsRemovingTransformer>
10841109
{
10851110
/// <summary>

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

+14-10
Original file line numberDiff line numberDiff line change
@@ -277,12 +277,14 @@ internal static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.Te
277277
=> new NgramExtractingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns);
278278

279279
/// <summary>
280-
/// Removes stop words from incoming token streams in <paramref name="inputColumnName"/>
281-
/// and outputs the token streams without stopwords as <paramref name="outputColumnName"/>.
280+
/// Create a <see cref="StopWordsRemovingEstimator"/>, which copies the data from the column specified in <paramref name="inputColumnName"/>
281+
/// to a new column: <paramref name="outputColumnName"/> and removes a predefined set of text specific to <paramref name="language"/> from it.
282282
/// </summary>
283-
/// <param name="catalog">The text-related transform's catalog.</param>
284-
/// <param name="outputColumnName">The column containing output text. Null means <paramref name="inputColumnName"/> is replaced.</param>
285-
/// <param name="inputColumnName">The column containing text to remove stop words on.</param>
283+
/// <param name="catalog">The transform's catalog.</param>
284+
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
285+
/// This column's data type will be variable-sized vector of text.</param>
286+
/// <param name="inputColumnName">Name of the column to copy the data from.
287+
/// This estimator operates over vector of text.</param>
286288
/// <param name="language">Language of the input text column <paramref name="inputColumnName"/>.</param>
287289
/// <example>
288290
/// <format type="text/markdown">
@@ -298,12 +300,14 @@ public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsC
298300
=> new StopWordsRemovingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, language);
299301

300302
/// <summary>
301-
/// Removes stop words from incoming token streams in <paramref name="inputColumnName"/>
302-
/// and outputs the token streams without stopwords as <paramref name="outputColumnName"/>.
303+
/// Create a <see cref="CustomStopWordsRemovingEstimator"/>, which copies the data from the column specified in <paramref name="inputColumnName"/>
304+
/// to a new column: <paramref name="outputColumnName"/> and removes text specified in <paramref name="stopwords"/> from it.
303305
/// </summary>
304-
/// <param name="catalog">The text-related transform's catalog.</param>
305-
/// <param name="outputColumnName">The column containing output text. Null means <paramref name="inputColumnName"/> is replaced.</param>
306-
/// <param name="inputColumnName">The column containing text to remove stop words on.</param>
306+
/// <param name="catalog">The transform's catalog.</param>
307+
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.
308+
/// This column's data type will be variable-sized vector of text.</param>
309+
/// <param name="inputColumnName">Name of the column to copy the data from.
310+
/// This estimator operates over vector of text.</param>
307311
/// <param name="stopwords">Array of words to remove.</param>
308312
/// <example>
309313
/// <format type="text/markdown">

0 commit comments

Comments
 (0)