
Commit 305b2a6

Polish char-level tokenizers
1 parent 14edcb0 commit 305b2a6

7 files changed (+19 -13 lines)

docs/samples/Microsoft.ML.Samples/Dynamic/NgramExtraction.cs (+1 -1)

@@ -26,7 +26,7 @@ public static void NgramTransform()
     // A pipeline to tokenize text as characters and then combine them together into ngrams
     // The pipeline uses the default settings to featurize.

-    var charsPipeline = ml.Transforms.Text.TokenizeCharacters("Chars", "SentimentText", useMarkerCharacters: false);
+    var charsPipeline = ml.Transforms.Text.ProduceCharacterTokens("Chars", "SentimentText", useMarkerCharacters: false);
     var ngramOnePipeline = ml.Transforms.Text.ProduceNgrams("CharsUnigrams", "Chars", ngramLength: 1);
     var ngramTwpPipeline = ml.Transforms.Text.ProduceNgrams("CharsTwograms", "Chars");
     var oneCharsPipeline = charsPipeline.Append(ngramOnePipeline);
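
For orientation, here is a minimal, self-contained sketch (not part of the commit) of the renamed sample above. It assumes the post-rename surface shown in this diff, ProduceCharacterTokens and ProduceNgrams on the text transforms catalog; the SentimentData class is a hypothetical input type defined only for this example.

using System;
using System.Collections.Generic;
using Microsoft.ML;

public class SentimentData
{
    // Hypothetical input type for this sketch; only the text column is needed.
    public string SentimentText { get; set; }
}

public static class CharNgramSketch
{
    public static void Run()
    {
        var ml = new MLContext();
        var data = ml.Data.LoadFromEnumerable(new List<SentimentData>
        {
            new SentimentData { SentimentText = "Best game I've ever played." },
            new SentimentData { SentimentText = "A clearly written book." }
        });

        // Character tokens first (markers disabled, as in the sample above),
        // then a character unigram bag on top of them.
        var charsPipeline = ml.Transforms.Text.ProduceCharacterTokens("Chars", "SentimentText", useMarkerCharacters: false);
        var ngramOnePipeline = ml.Transforms.Text.ProduceNgrams("CharsUnigrams", "Chars", ngramLength: 1);
        var oneCharsPipeline = charsPipeline.Append(ngramOnePipeline);

        var transformed = oneCharsPipeline.Fit(data).Transform(data);
        for (int i = 0; i < transformed.Schema.Count; i++)
            Console.WriteLine(transformed.Schema[i].Name);
    }
}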

src/Microsoft.ML.StaticPipe/TextStaticExtensions.cs (+1 -1)

@@ -109,7 +109,7 @@ public override IEstimator<ITransformer> Reconcile(IHostEnvironment env,
     /// </summary>
     /// <param name="input">The column to apply to.</param>
     /// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
-    public static VarVector<Key<ushort, string>> TokenizeIntoCharacters(this Scalar<string> input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters);
+    public static VarVector<Key<ushort, string>> ProduceCharacterTokens(this Scalar<string> input, bool useMarkerCharacters = true) => new OutPipelineColumn(input, useMarkerCharacters);
 }

 /// <summary>

src/Microsoft.ML.Transforms/Text/TextCatalog.cs (+6 -4)

@@ -56,8 +56,9 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
     /// <param name="catalog">The text-related transform's catalog.</param>
     /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
     /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
-    /// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
-    public static TokenizingByCharactersEstimator TokenizeCharacters(this TransformsCatalog.TextTransforms catalog,
+    /// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
+    /// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
+    public static TokenizingByCharactersEstimator ProduceCharacterTokens(this TransformsCatalog.TextTransforms catalog,
         string outputColumnName,
         string inputColumnName = null,
         bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters)
@@ -68,10 +69,11 @@ public static TokenizingByCharactersEstimator TokenizeCharacters(this Transforms
     /// Tokenize incoming text in input columns and output the tokens as output columns.
     /// </summary>
     /// <param name="catalog">The text-related transform's catalog.</param>
-    /// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
+    /// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
+    /// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
     /// <param name="columns">Pairs of columns to run the tokenization on.</param>

-    public static TokenizingByCharactersEstimator TokenizeCharacters(this TransformsCatalog.TextTransforms catalog,
+    public static TokenizingByCharactersEstimator ProduceCharacterTokens(this TransformsCatalog.TextTransforms catalog,
         bool useMarkerCharacters = CharTokenizingDefaults.UseMarkerCharacters,
         params ColumnOptions[] columns)
         => new TokenizingByCharactersEstimator(Contracts.CheckRef(catalog, nameof(catalog)).GetEnvironment(), useMarkerCharacters, ColumnOptions.ConvertToValueTuples(columns));
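
The rewritten doc comment pins down what useMarkerCharacters actually does: it brackets each output character vector with 0x02 (STX) at the start and 0x03 (ETX) at the end. Below is a small sketch of that behavior using the renamed catalog method; it is an illustration only, and the TextData class and column names are hypothetical, not part of this change.

using System;
using System.Collections.Generic;
using Microsoft.ML;

public class TextData
{
    // Hypothetical input type for this sketch.
    public string Text { get; set; }
}

public static class MarkerCharacterSketch
{
    public static void Run()
    {
        var ml = new MLContext();
        var data = ml.Data.LoadFromEnumerable(new List<TextData>
        {
            new TextData { Text = "Hi" }
        });

        // Per the updated doc comment: with useMarkerCharacters: true (the default),
        // the character vector for "Hi" is 0x02 (STX), 'H', 'i', 0x03 (ETX) - four tokens.
        // With useMarkerCharacters: false it is just the two characters 'H', 'i'.
        var withMarkers = ml.Transforms.Text.ProduceCharacterTokens("CharsMarked", "Text");
        var withoutMarkers = ml.Transforms.Text.ProduceCharacterTokens("CharsPlain", "Text", useMarkerCharacters: false);

        var transformed = withMarkers.Append(withoutMarkers).Fit(data).Transform(data);
        Console.WriteLine($"CharsMarked type: {transformed.Schema["CharsMarked"].Type}");
        Console.WriteLine($"CharsPlain type: {transformed.Schema["CharsPlain"].Type}");
    }
}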

src/Microsoft.ML.Transforms/Text/TokenizingByCharacters.cs (+8 -4)

@@ -102,7 +102,8 @@ private static VersionInfo GetVersionInfo()
     /// Tokenize incoming text in input columns and output the tokens as output columns.
     /// </summary>
     /// <param name="env">The environment.</param>
-    /// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
+    /// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
+    /// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
     /// <param name="columns">Pairs of columns to run the tokenization on.</param>
     internal TokenizingByCharactersTransformer(IHostEnvironment env, bool useMarkerCharacters = TokenizingByCharactersEstimator.Defaults.UseMarkerCharacters,
         params (string outputColumnName, string inputColumnName)[] columns) :
@@ -114,7 +115,7 @@ internal TokenizingByCharactersTransformer(IHostEnvironment env, bool useMarkerC
     /// <summary>
     /// The names of the output and input column pairs on which the transformation is applied.
     /// </summary>
-    public IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly();
+    internal IReadOnlyCollection<(string outputColumnName, string inputColumnName)> Columns => ColumnPairs.AsReadOnly();

     private protected override void CheckInputColumn(DataViewSchema inputSchema, int col, int srcCol)
     {
@@ -555,6 +556,7 @@ internal static class Defaults
         {
             public const bool UseMarkerCharacters = true;
         }
+
         internal static bool IsColumnTypeValid(DataViewType type) => type.GetItemType() is TextDataViewType;

         internal const string ExpectedColumnType = "Text";
@@ -565,7 +567,8 @@ internal static class Defaults
     /// <param name="env">The environment.</param>
     /// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
     /// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
-    /// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
+    /// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
+    /// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
     internal TokenizingByCharactersEstimator(IHostEnvironment env, string outputColumnName, string inputColumnName = null,
         bool useMarkerCharacters = Defaults.UseMarkerCharacters)
         : this(env, useMarkerCharacters, new[] { (outputColumnName, inputColumnName ?? outputColumnName) })
@@ -576,7 +579,8 @@ internal TokenizingByCharactersEstimator(IHostEnvironment env, string outputColu
     /// Tokenize incoming text in input columns and output the tokens as output columns.
     /// </summary>
     /// <param name="env">The environment.</param>
-    /// <param name="useMarkerCharacters">Whether to use marker characters to separate words.</param>
+    /// <param name="useMarkerCharacters">Whether to prepend a marker character, <see langword="0x02"/>, to the beginning,
+    /// and append another marker character, <see langword="0x03"/>, to the end of the output vector of characters.</param>
    /// <param name="columns">Pairs of columns to run the tokenization on.</param>

     internal TokenizingByCharactersEstimator(IHostEnvironment env, bool useMarkerCharacters = Defaults.UseMarkerCharacters,

test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs (+1 -1)

@@ -520,7 +520,7 @@ public void Tokenize()
                 .Append(r => (
                     r.label,
                     tokens: r.text.TokenizeText(),
-                    chars: r.text.TokenizeIntoCharacters()));
+                    chars: r.text.ProduceCharacterTokens()));

             var tdata = est.Fit(data).Transform(data);
             var schema = tdata.AsDynamic.Schema;

test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs (+1 -1)

@@ -467,7 +467,7 @@ private void TextFeaturizationOn(string dataPath)
                     BagOfBigrams: r.Message.NormalizeText().ToBagofHashedWords(ngramLength: 2, allLengths: false),

                     // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting.
-                    BagOfTrichar: r.Message.TokenizeIntoCharacters().ToNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf),
+                    BagOfTrichar: r.Message.ProduceCharacterTokens().ToNgrams(ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf),

                     // NLP pipeline 4: word embeddings.
                     // PretrainedModelKind.Sswe is used here for performance of the test. In a real

test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs (+1 -1)

@@ -305,7 +305,7 @@ private void TextFeaturizationOn(string dataPath)
                     ngramLength: 2, allLengths: false))

                 // NLP pipeline 3: bag of tri-character sequences with TF-IDF weighting.
-                .Append(mlContext.Transforms.Text.TokenizeCharacters("MessageChars", "Message"))
+                .Append(mlContext.Transforms.Text.ProduceCharacterTokens("MessageChars", "Message"))
                 .Append(new NgramExtractingEstimator(mlContext, "BagOfTrichar", "MessageChars",
                     ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf))
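
For completeness, a sketch of the tri-character TF-IDF bag from the cookbook sample above, composed through the renamed catalog method only. This is an illustration, not part of the commit: it assumes the ProduceNgrams catalog method exposes a weighting parameter of type NgramExtractingEstimator.WeightingCriteria (the sample instead constructs NgramExtractingEstimator directly), the Microsoft.ML.Transforms.Text namespace is assumed, and MessageData is a hypothetical input type.

using System;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Transforms.Text;   // assumed namespace for NgramExtractingEstimator

public class MessageData
{
    // Hypothetical input type for this sketch.
    public string Message { get; set; }
}

public static class TriCharTfIdfSketch
{
    public static void Run()
    {
        var mlContext = new MLContext();
        var data = mlContext.Data.LoadFromEnumerable(new List<MessageData>
        {
            new MessageData { Message = "free prize inside" },
            new MessageData { Message = "meeting moved to tuesday" }
        });

        // Character tokens, then TF-IDF weighted tri-character ngrams.
        // Assumption: ProduceNgrams accepts a 'weighting' argument; the cookbook
        // sample reaches the same result by constructing NgramExtractingEstimator directly.
        var pipeline = mlContext.Transforms.Text.ProduceCharacterTokens("MessageChars", "Message")
            .Append(mlContext.Transforms.Text.ProduceNgrams("BagOfTrichar", "MessageChars",
                ngramLength: 3, weighting: NgramExtractingEstimator.WeightingCriteria.TfIdf));

        var transformed = pipeline.Fit(data).Transform(data);
        Console.WriteLine($"BagOfTrichar type: {transformed.Schema["BagOfTrichar"].Type}");
    }
}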
