Skip to content

Commit 950f133

Browse files
authored
Created samples for TokenizeIntoWords and RemoveStopWords APIs. (#3156)
1 parent 51fc8b2 commit 950f133

File tree

9 files changed

+196
-90
lines changed

9 files changed

+196
-90
lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs

Lines changed: 0 additions & 82 deletions
This file was deleted.

docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyCustomWordEmbedding.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ public static void Example()
1313
// as well as the source of randomness.
1414
var mlContext = new MLContext();
1515

16-
// Create an empty data sample list. The 'ApplyWordEmbedding' does not require training data as
16+
// Create an empty list as the dataset. The 'ApplyWordEmbedding' API does not require training data as
1717
// the estimator ('WordEmbeddingEstimator') created by 'ApplyWordEmbedding' API is not a trainable estimator.
1818
// The empty list is only needed to pass input schema to the pipeline.
1919
var emptySamples = new List<TextData>();

docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ApplyWordEmbedding.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ public static void Example()
1212
// as well as the source of randomness.
1313
var mlContext = new MLContext();
1414

15-
// Create an empty data sample list. The 'ApplyWordEmbedding' does not require training data as
15+
// Create an empty list as the dataset. The 'ApplyWordEmbedding' API does not require training data as
1616
// the estimator ('WordEmbeddingEstimator') created by 'ApplyWordEmbedding' API is not a trainable estimator.
1717
// The empty list is only needed to pass input schema to the pipeline.
1818
var emptySamples = new List<TextData>();

docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ public static void Example()
1212
// as well as the source of randomness.
1313
var mlContext = new MLContext();
1414

15-
// Create an empty data sample list. The 'NormalizeText' API does not require training data as
15+
// Create an empty list as the dataset. The 'NormalizeText' API does not require training data as
1616
// the estimator ('TextNormalizingEstimator') created by 'NormalizeText' API is not a trainable estimator.
1717
// The empty list is only needed to pass input schema to the pipeline.
1818
var emptySamples = new List<TextData>();
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using Microsoft.ML.Transforms.Text;
4+
5+
namespace Microsoft.ML.Samples.Dynamic
6+
{
7+
/// <summary>
/// Sample that tokenizes a text column into words and then removes the default English stop words
/// with the 'RemoveDefaultStopWords' API.
/// </summary>
public static class RemoveDefaultStopWords
8+
{
9+
/// <summary>
/// Builds the tokenize-then-remove-stop-words pipeline, fits it on an empty dataset, runs one
/// sentence through a prediction engine and prints the remaining words to the console.
/// </summary>
public static void Example()
10+
{
11+
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
12+
// as well as the source of randomness.
13+
var mlContext = new MLContext();
14+
15+
// Create an empty list as the dataset. The 'RemoveDefaultStopWords' API does not require training data as
16+
// the estimator ('StopWordsRemovingEstimator') created by 'RemoveDefaultStopWords' API is not a trainable estimator.
17+
// The empty list is only needed to pass input schema to the pipeline.
18+
var emptySamples = new List<TextData>();
19+
20+
// Convert sample list to an empty IDataView.
21+
var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples);
22+
23+
// A pipeline for removing stop words from input text/string.
24+
// The pipeline first tokenizes the text into words and then removes the stop words.
25+
// The 'RemoveDefaultStopWords' API ignores casing of the text/string, e.g. 'tHe' and 'the' are treated as the same stop word.
26+
var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text")
27+
.Append(mlContext.Transforms.Text.RemoveDefaultStopWords("WordsWithoutStopWords", "Words", language: StopWordsRemovingEstimator.Language.English));
28+
29+
// Fit to data.
30+
var textTransformer = textPipeline.Fit(emptyDataView);
31+
32+
// Create the prediction engine to remove the stop words from the input text/string.
33+
var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);
34+
35+
// Call the prediction API to remove stop words.
36+
var data = new TextData() { Text = "ML.NET's RemoveDefaultStopWords API removes stop words from tHe text/string. It requires the text/string to be tokenized beforehand." };
37+
var prediction = predictionEngine.Predict(data);
38+
39+
// Print the length of the word vector after the stop words are removed.
40+
Console.WriteLine($"Number of words: {prediction.WordsWithoutStopWords.Length}");
41+
42+
// Print the word vector without stop words.
43+
Console.WriteLine($"\nWords without stop words: {string.Join(",", prediction.WordsWithoutStopWords)}");
44+
45+
// Expected output (the leading "\n" above prints a blank line between the two lines):
46+
// Number of words: 11
//
47+
// Words without stop words: ML.NET's,RemoveDefaultStopWords,API,removes,stop,words,text/string.,requires,text/string,tokenized,beforehand.
48+
}
49+
50+
/// <summary>Input row: the raw text handed to the pipeline.</summary>
public class TextData
51+
{
52+
public string Text { get; set; }
53+
}
54+
55+
/// <summary>Output row: the input text plus the words left after stop word removal.</summary>
public class TransformedTextData : TextData
56+
{
57+
public string[] WordsWithoutStopWords { get; set; }
58+
}
59+
}
60+
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using Microsoft.ML.Transforms.Text;
4+
5+
namespace Microsoft.ML.Samples.Dynamic
6+
{
7+
/// <summary>
/// Sample that tokenizes a text column into words and then removes a caller-supplied list of
/// stop words with the 'RemoveStopWords' API.
/// </summary>
public static class RemoveStopWords
8+
{
9+
/// <summary>
/// Builds the tokenize-then-remove-stop-words pipeline, fits it on an empty dataset, runs one
/// sentence through a prediction engine and prints the remaining words to the console.
/// </summary>
public static void Example()
10+
{
11+
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
12+
// as well as the source of randomness.
13+
var mlContext = new MLContext();
14+
15+
// Create an empty list as the dataset. The 'RemoveStopWords' API does not require training data as
16+
// the estimator ('CustomStopWordsRemovingEstimator') created by 'RemoveStopWords' API is not a trainable estimator.
17+
// The empty list is only needed to pass input schema to the pipeline.
18+
var emptySamples = new List<TextData>();
19+
20+
// Convert sample list to an empty IDataView.
21+
var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples);
22+
23+
// A pipeline for removing stop words from input text/string.
24+
// The pipeline first tokenizes the text into words and then removes the stop words passed via 'stopwords'.
25+
// The 'RemoveStopWords' API ignores casing of the text/string, e.g. 'tHe' and 'the' are treated as the same stop word.
26+
var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text")
27+
.Append(mlContext.Transforms.Text.RemoveStopWords("WordsWithoutStopWords", "Words", stopwords: new[] { "a", "the", "from", "by" }));
28+
29+
// Fit to data.
30+
var textTransformer = textPipeline.Fit(emptyDataView);
31+
32+
// Create the prediction engine to remove the stop words from the input text/string.
33+
var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);
34+
35+
// Call the prediction API to remove stop words.
36+
var data = new TextData() { Text = "ML.NET's RemoveStopWords API removes stop words from tHe text/string using a list of stop words provided by the user." };
37+
var prediction = predictionEngine.Predict(data);
38+
39+
// Print the length of the word vector after the stop words are removed.
40+
Console.WriteLine($"Number of words: {prediction.WordsWithoutStopWords.Length}");
41+
42+
// Print the word vector without stop words.
43+
Console.WriteLine($"\nWords without stop words: {string.Join(",", prediction.WordsWithoutStopWords)}");
44+
45+
// Expected output (the leading "\n" above prints a blank line between the two lines):
46+
// Number of words: 14
//
47+
// Words without stop words: ML.NET's,RemoveStopWords,API,removes,stop,words,text/string,using,list,of,stop,words,provided,user.
48+
}
49+
50+
/// <summary>Input row: the raw text handed to the pipeline.</summary>
public class TextData
51+
{
52+
public string Text { get; set; }
53+
}
54+
55+
/// <summary>Output row: the input text plus the words left after stop word removal.</summary>
public class TransformedTextData : TextData
56+
{
57+
public string[] WordsWithoutStopWords { get; set; }
58+
}
59+
}
60+
}

docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoCharactersAsKeys.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ public static void Example()
1212
// as well as the source of randomness.
1313
var mlContext = new MLContext();
1414

15-
// Create an empty data sample list. The 'TokenizeIntoCharactersAsKeys' does not require training data as
15+
// Create an empty list as the dataset. The 'TokenizeIntoCharactersAsKeys' API does not require training data as
1616
// the estimator ('TokenizingByCharactersEstimator') created by 'TokenizeIntoCharactersAsKeys' API is not a trainable estimator.
1717
// The empty list is only needed to pass input schema to the pipeline.
1818
var emptySamples = new List<TextData>();
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Text;
4+
5+
namespace Microsoft.ML.Samples.Dynamic
6+
{
7+
/// <summary>
/// Sample that splits a text column into a vector of words with the 'TokenizeIntoWords' API,
/// using the space character as the separator.
/// </summary>
public static class TokenizeIntoWords
8+
{
9+
/// <summary>
/// Builds the tokenizing pipeline, fits it on an empty dataset, runs one sentence through a
/// prediction engine and prints the resulting words to the console.
/// </summary>
public static void Example()
10+
{
11+
// Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
12+
// as well as the source of randomness.
13+
var mlContext = new MLContext();
14+
15+
// Create an empty list as the dataset. The 'TokenizeIntoWords' API does not require training data as
16+
// the estimator ('WordTokenizingEstimator') created by 'TokenizeIntoWords' API is not a trainable estimator.
17+
// The empty list is only needed to pass input schema to the pipeline.
18+
var emptySamples = new List<TextData>();
19+
20+
// Convert sample list to an empty IDataView.
21+
var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples);
22+
23+
// A pipeline for converting text into a vector of words.
24+
// The following call to 'TokenizeIntoWords' tokenizes text/string into words using space as a separator.
25+
// Space is also the default value for the 'separators' argument if it is not specified.
26+
var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Words", "Text", separators: new[] { ' ' });
27+
28+
// Fit to data.
29+
var textTransformer = textPipeline.Fit(emptyDataView);
30+
31+
// Create the prediction engine to get the word vector from the input text/string.
32+
var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);
33+
34+
// Call the prediction API to convert the text into words.
35+
var data = new TextData() { Text = "ML.NET's TokenizeIntoWords API splits text/string into words using the list of characters provided as separators." };
36+
var prediction = predictionEngine.Predict(data);
37+
38+
// Print the length of the word vector.
39+
Console.WriteLine($"Number of words: {prediction.Words.Length}");
40+
41+
// Print the word vector.
42+
Console.WriteLine($"\nWords: {string.Join(",", prediction.Words)}");
43+
44+
// Expected output (the leading "\n" above prints a blank line between the two lines):
45+
// Number of words: 15
//
46+
// Words: ML.NET's,TokenizeIntoWords,API,splits,text/string,into,words,using,the,list,of,characters,provided,as,separators.
47+
}
48+
49+
/// <summary>Input row: the raw text handed to the pipeline.</summary>
public class TextData
50+
{
51+
public string Text { get; set; }
52+
}
53+
54+
/// <summary>Output row: the input text plus the words produced by the tokenizer.</summary>
public class TransformedTextData : TextData
55+
{
56+
public string[] Words { get; set; }
57+
}
58+
}
59+
}

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,13 @@ internal static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog
186186
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
187187
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
188188
/// <param name="separators">The separators to use (uses space character by default).</param>
189+
/// <example>
190+
/// <format type="text/markdown">
191+
/// <![CDATA[
192+
/// [!code-csharp[TokenizeIntoWords](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/TokenizeIntoWords.cs)]
193+
/// ]]>
194+
/// </format>
195+
/// </example>
189196
public static WordTokenizingEstimator TokenizeIntoWords(this TransformsCatalog.TextTransforms catalog,
190197
string outputColumnName,
191198
string inputColumnName = null,
@@ -254,8 +261,9 @@ internal static NgramExtractingEstimator ProduceNgrams(this TransformsCatalog.Te
254261
/// <example>
255262
/// <format type="text/markdown">
256263
/// <![CDATA[
257-
/// [!code-csharp[FastTree](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs)]
258-
/// ]]></format>
264+
/// [!code-csharp[RemoveDefaultStopWords](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveDefaultStopWords.cs)]
265+
/// ]]>
266+
/// </format>
259267
/// </example>
260268
public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsCatalog.TextTransforms catalog,
261269
string outputColumnName,
@@ -274,8 +282,9 @@ public static StopWordsRemovingEstimator RemoveDefaultStopWords(this TransformsC
274282
/// <example>
275283
/// <format type="text/markdown">
276284
/// <![CDATA[
277-
/// [!code-csharp[FastTree](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs)]
278-
/// ]]></format>
285+
/// [!code-csharp[RemoveStopWords](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/RemoveStopWords.cs)]
286+
/// ]]>
287+
/// </format>
279288
/// </example>
280289
public static CustomStopWordsRemovingEstimator RemoveStopWords(this TransformsCatalog.TextTransforms catalog,
281290
string outputColumnName,

0 commit comments

Comments
 (0)