Skip to content

Commit 145bf74

Browse files
committed
Polish word-level tokenizers
1 parent 305b2a6 commit 145bf74

File tree

15 files changed

+32
-31
lines changed

15 files changed

+32
-31
lines changed

docs/samples/Microsoft.ML.Samples/Dynamic/KeyToValueValueToKey.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,15 @@ public static void Example()
3030
// making use of default settings.
3131
string defaultColumnName = "DefaultKeys";
3232
// REVIEW create through the catalog extension
33-
var default_pipeline = ml.Transforms.Text.TokenizeWords("Review")
33+
var default_pipeline = ml.Transforms.Text.ProduceWordTokens("Review")
3434
.Append(ml.Transforms.Conversion.MapValueToKey(defaultColumnName, "Review"));
3535

3636
// Another pipeline, that customizes the advanced settings of the ValueToKeyMappingEstimator.
3737
// We can change the maxNumTerm to limit how many keys will get generated out of the set of words,
3838
// and condition the order in which they get evaluated by changing sort from the default Occurrence (order in which they get encountered)
3939
// to value/alphabetically.
4040
string customizedColumnName = "CustomizedKeys";
41-
var customized_pipeline = ml.Transforms.Text.TokenizeWords("Review")
41+
var customized_pipeline = ml.Transforms.Text.ProduceWordTokens("Review")
4242
.Append(ml.Transforms.Conversion.MapValueToKey(customizedColumnName, "Review", maxNumKeys: 10, sort: ValueToKeyMappingEstimator.SortOrder.Value));
4343

4444
// The transformed data.

docs/samples/Microsoft.ML.Samples/Dynamic/StopWordRemoverTransform.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ public static void Example()
2525

2626
// Let's take SentimentText column and break it into vector of words.
2727
string originalTextColumnName = "Words";
28-
var words = ml.Transforms.Text.TokenizeWords("SentimentText", originalTextColumnName);
28+
var words = ml.Transforms.Text.ProduceWordTokens("SentimentText", originalTextColumnName);
2929

3030
// Default pipeline will apply default stop word remover which is based on a predefined set of words for certain languages.
3131
var defaultPipeline = words.Append(ml.Transforms.Text.RemoveDefaultStopWords(originalTextColumnName, "DefaultRemover"));

docs/samples/Microsoft.ML.Samples/Dynamic/TensorFlow/TextClassification.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ public static void Example()
6868
j.Features = features;
6969
};
7070

71-
var engine = mlContext.Transforms.Text.TokenizeWords("TokenizedWords", "Sentiment_Text")
71+
var engine = mlContext.Transforms.Text.ProduceWordTokens("TokenizedWords", "Sentiment_Text")
7272
.Append(mlContext.Transforms.Conversion.ValueMap(lookupMap, "Words", "Ids", new ColumnOptions[] { ("VariableLenghtFeatures", "TokenizedWords") }))
7373
.Append(mlContext.Transforms.CustomMapping(ResizeFeaturesAction, "Resize"))
7474
.Append(tensorFlowModel.ScoreTensorFlowModel(new[] { "Prediction/Softmax" }, new[] { "Features" }))

docs/samples/Microsoft.ML.Samples/Dynamic/WordEmbeddingTransform.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ public static void Example()
2626

2727
// Pipeline which goes through SentimentText and normalizes it, tokenizes it by words, and removes default stopwords.
2828
var wordsPipeline = ml.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
29-
.Append(ml.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
29+
.Append(ml.Transforms.Text.ProduceWordTokens("Words", "NormalizedText"))
3030
.Append(ml.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
3131

3232
var wordsDataview = wordsPipeline.Fit(trainData).Transform(trainData);

src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
5555
/// </summary>
5656
/// <param name="input">The column to apply to.</param>
5757
/// <param name="separators">The separators to use (uses space character by default).</param>
58-
public static VarVector<string> TokenizeText(this Scalar<string> input, char[] separators = null) => new OutPipelineColumn(input, separators);
58+
public static VarVector<string> ProduceWordTokens(this Scalar<string> input, char[] separators = null) => new OutPipelineColumn(input, separators);
5959
}
6060

6161
/// <summary>

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

+2-13
Original file line numberDiff line numberDiff line change
@@ -160,29 +160,18 @@ public static WordEmbeddingEstimator ApplyWordEmbedding(this TransformsCatalog.T
160160
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
161161
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
162162
/// <param name="separators">The separators to use (uses space character by default).</param>
163-
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
163+
public static WordTokenizingEstimator ProduceWordTokens(this TransformsCatalog.TextTransforms catalog,
164164
string outputColumnName,
165165
string inputColumnName = null,
166166
char[] separators = null)
167167
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), outputColumnName, inputColumnName, separators);
168168

169-
/// <summary>
170-
/// Tokenizes incoming text in input columns and outputs the tokens using <paramref name="separators"/> as separators.
171-
/// </summary>
172-
/// <param name="catalog">The text-related transform's catalog.</param>
173-
/// <param name="columns">Pairs of columns to run the tokenization on.</param>
174-
/// <param name="separators">The separators to use (uses space character by default).</param>
175-
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
176-
(string outputColumnName, string inputColumnName)[] columns,
177-
char[] separators = null)
178-
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns, separators);
179-
180169
/// <summary>
181170
/// Tokenizes incoming text in input columns, using per-column configurations, and outputs the tokens.
182171
/// </summary>
183172
/// <param name="catalog">The text-related transform's catalog.</param>
184173
/// <param name="columns">Pairs of columns to run the tokenization on.</param>
185-
public static WordTokenizingEstimator TokenizeWords(this TransformsCatalog.TextTransforms catalog,
174+
public static WordTokenizingEstimator ProduceWordTokens(this TransformsCatalog.TextTransforms catalog,
186175
params WordTokenizingEstimator.ColumnOptions[] columns)
187176
=> new WordTokenizingEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), columns);
188177

src/Microsoft.ML.Transforms/Text/WordTokenizing.cs

+12
Original file line numberDiff line numberDiff line change
@@ -441,9 +441,21 @@ internal WordTokenizingEstimator(IHostEnvironment env, params ColumnOptions[] co
441441
}
442442
public sealed class ColumnOptions
443443
{
444+
/// <summary>
445+
/// Output column name that will be used to store the tokenization result of <see cref="InputColumnName"/> column.
446+
/// </summary>
444447
public readonly string Name;
448+
/// <summary>
449+
/// Input column name that will be tokenized into words.
450+
/// </summary>
445451
public readonly string InputColumnName;
452+
/// <summary>
453+
/// Separator list used to tokenize the input string. If not specified, space will be used.
454+
/// </summary>
446455
public IReadOnlyList<char> Separators => SeparatorsArray;
456+
/// <summary>
457+
/// State of <see cref="Separators"/>. Since <see langword="char"/>[] is mutable, it's not safe to directly expose this field to users.
458+
/// </summary>
447459
internal readonly char[] SeparatorsArray;
448460

449461
/// <summary>

test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs

+4-4
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,7 @@ public void Tokenize()
519519
var est = data.MakeNewEstimator()
520520
.Append(r => (
521521
r.label,
522-
tokens: r.text.TokenizeText(),
522+
tokens: r.text.ProduceWordTokens(),
523523
chars: r.text.ProduceCharacterTokens()));
524524

525525
var tdata = est.Fit(data).Transform(data);
@@ -547,7 +547,7 @@ public void NormalizeTextAndRemoveStopWords()
547547
.Append(r => (
548548
r.label,
549549
normalized_text: r.text.NormalizeText(),
550-
words_without_stopwords: r.text.TokenizeText().RemoveStopwords()));
550+
words_without_stopwords: r.text.ProduceWordTokens().RemoveStopwords()));
551551

552552
var tdata = est.Fit(data).Transform(data);
553553
var schema = tdata.AsDynamic.Schema;
@@ -604,8 +604,8 @@ public void Ngrams()
604604
var est = data.MakeNewEstimator()
605605
.Append(r => (
606606
r.label,
607-
ngrams: r.text.TokenizeText().ToKey().ToNgrams(),
608-
ngramshash: r.text.TokenizeText().ToKey().ToNgramsHash()));
607+
ngrams: r.text.ProduceWordTokens().ToKey().ToNgrams(),
608+
ngramshash: r.text.ProduceWordTokens().ToKey().ToNgramsHash()));
609609

610610
var tdata = est.Fit(data).Transform(data);
611611
var schema = tdata.AsDynamic.Schema;

test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,7 @@ private void TextFeaturizationOn(string dataPath)
472472
// NLP pipeline 4: word embeddings.
473473
// PretrainedModelKind.Sswe is used here for performance of the test. In a real
474474
// scenario, it is best to use a different model for more accuracy.
475-
Embeddings: r.Message.NormalizeText().TokenizeText().WordEmbeddings(WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
475+
Embeddings: r.Message.NormalizeText().ProduceWordTokens().WordEmbeddings(WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding)
476476
));
477477

478478
// Let's train our pipeline, and then apply it to the same data.

test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,7 @@ private void TextFeaturizationOn(string dataPath)
312312
// NLP pipeline 4: word embeddings.
313313
// PretrainedModelKind.Sswe is used here for performance of the test. In a real
314314
// scenario, it is best to use a different model for more accuracy.
315-
.Append(mlContext.Transforms.Text.TokenizeWords("TokenizedMessage", "NormalizedMessage"))
315+
.Append(mlContext.Transforms.Text.ProduceWordTokens("TokenizedMessage", "NormalizedMessage"))
316316
.Append(mlContext.Transforms.Text.ApplyWordEmbedding("Embeddings", "TokenizedMessage",
317317
WordEmbeddingEstimator.PretrainedModelKind.SentimentSpecificWordEmbedding));
318318

test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -984,7 +984,7 @@ public void TensorFlowSentimentClassificationTest()
984984
// The first pipeline 'dataPipe' tokenizes the string into words and maps each word to an integer which is an index in the dictionary.
985985
// Then this integer vector is retrieved from the pipeline and resized to fixed length.
986986
// The second pipeline 'tfEnginePipe' takes the resized integer vector and passes it to TensorFlow and gets the classification scores.
987-
var estimator = mlContext.Transforms.Text.TokenizeWords("TokenizedWords", "Sentiment_Text")
987+
var estimator = mlContext.Transforms.Text.ProduceWordTokens("TokenizedWords", "Sentiment_Text")
988988
.Append(mlContext.Transforms.Conversion.ValueMap(lookupMap, "Words", "Ids", new ColumnOptions[] { ("Features", "TokenizedWords") }));
989989
var dataPipe = estimator.Fit(dataView)
990990
.CreatePredictionEngine<TensorFlowSentiment, TensorFlowSentiment>(mlContext);

test/Microsoft.ML.Tests/Transformers/CategoricalHashTests.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ public void CategoricalHashStatic()
7676
row.ScalarString,
7777
row.VectorString,
7878
// Create a VarVector column
79-
VarVectorString: row.ScalarString.TokenizeText())).
79+
VarVectorString: row.ScalarString.ProduceWordTokens())).
8080
Append(row => (
8181
A: row.ScalarString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashScalarOutputKind.Ind),
8282
B: row.VectorString.OneHotHashEncoding(outputKind: CategoricalHashStaticExtensions.OneHotHashVectorOutputKind.Ind),

test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,7 @@ public void TextNormalizationAndStopwordRemoverWorkout()
143143
text: ctx.LoadFloat(1)), hasHeader: true)
144144
.Load(sentimentDataPath);
145145
var est = ML.Transforms.Text.NormalizeText("text")
146-
.Append(ML.Transforms.Text.TokenizeWords("words", "text"))
146+
.Append(ML.Transforms.Text.ProduceWordTokens("words", "text"))
147147
.Append(ML.Transforms.Text.RemoveDefaultStopWords("NoDefaultStopwords", "words"))
148148
.Append(ML.Transforms.Text.RemoveStopWords("NoStopWords", "words", "xbox", "this", "is", "a", "the", "THAT", "bY"));
149149

test/Microsoft.ML.Tests/Transformers/ValueMappingTests.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -546,7 +546,7 @@ public void ValueMappingInputIsVectorWorkout()
546546
var keys = new List<ReadOnlyMemory<char>>() { "foo".AsMemory(), "bar".AsMemory(), "test".AsMemory(), "wahoo".AsMemory() };
547547
var values = new List<int>() { 1, 2, 3, 4 };
548548

549-
var est = ML.Transforms.Text.TokenizeWords("TokenizeB", "B")
549+
var est = ML.Transforms.Text.ProduceWordTokens("TokenizeB", "B")
550550
.Append(ML.Transforms.Conversion.ValueMap(keys, values, new ColumnOptions[] { ("VecB", "TokenizeB") }));
551551
TestEstimatorCore(est, validFitInput: dataView, invalidInput: badDataView);
552552
}

test/Microsoft.ML.Tests/Transformers/WordEmbeddingsTests.cs

+2-2
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ public void TestWordEmbeddings()
3535
}).Load(GetDataPath(dataPath));
3636

3737
var est = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
38-
.Append(ML.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
38+
.Append(ML.Transforms.Text.ProduceWordTokens("Words", "NormalizedText"))
3939
.Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
4040
var words = est.Fit(data).Transform(data);
4141

@@ -70,7 +70,7 @@ public void TestCustomWordEmbeddings()
7070
}).Load(GetDataPath(dataPath));
7171

7272
var est = ML.Transforms.Text.NormalizeText("NormalizedText", "SentimentText", keepDiacritics: false, keepPunctuations: false)
73-
.Append(ML.Transforms.Text.TokenizeWords("Words", "NormalizedText"))
73+
.Append(ML.Transforms.Text.ProduceWordTokens("Words", "NormalizedText"))
7474
.Append(ML.Transforms.Text.RemoveDefaultStopWords("CleanWords", "Words"));
7575
var words = est.Fit(data).Transform(data);
7676
var pathToCustomModel = DeleteOutputPath("custommodel.txt");

0 commit comments

Comments
 (0)