From b79c747992f22aafe1ca83e7de5f2abaa95b9fb5 Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Thu, 28 Mar 2019 14:43:20 -0700 Subject: [PATCH 1/2] Created sample for text normalizing API. --- .../Dynamic/Transforms/Text/NormalizeText.cs | 57 +++++++++++++++++++ .../Text/TextCatalog.cs | 14 +++++ 2 files changed, 71 insertions(+) create mode 100644 docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs new file mode 100644 index 0000000000..7f18469f41 --- /dev/null +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs @@ -0,0 +1,57 @@ +using System; +using System.Collections.Generic; +using System.Text; + +namespace Microsoft.ML.Samples.Dynamic +{ + public static class NormalizeText + { + public static void Example() + { + // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging, + // as well as the source of randomness. + var mlContext = new MLContext(); + + // Create an empty data sample list. The 'NormalizeText' API does not require training data as + // the estimator ('TextNormalizingEstimator') created by 'NormalizeText' API is not a trainable estimator. + // The empty list is only needed to pass input schema to the pipeline. + var samples = new List<TextData>(); + + // Convert sample list to an empty IDataView. + var dataview = mlContext.Data.LoadFromEnumerable(samples); + + // A pipeline for normalizing text. + var normTextPipeline = mlContext.Transforms.Text.NormalizeText("NormalizedText", "Text", + Transforms.Text.TextNormalizingEstimator.CaseMode.Lower, + keepDiacritics: false, + keepPunctuations: false, + keepNumbers: false); + + // Fit to data. + var normTextTransformer = normTextPipeline.Fit(dataview); + + // Create the prediction engine to get the normalized text from the input text/string. 
+ var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(normTextTransformer); + + // Call the prediction API. + var data = new TextData() { Text = "ML.NET's NormalizeText API changes the case of the TEXT and removes/keeps diâcrîtîcs, punctuations, and/or numbers (123)." }; + var prediction = predictionEngine.Predict(data); + + // Print the normalized text. + Console.WriteLine($"Normalized Text: {prediction.NormalizedText}"); + + // Expected output: + // Normalized Text: mlnets normalizetext api changes the case of the text and removeskeeps diacritics punctuations andor numbers + } + + public class TextData + { + public string Text { get; set; } + } + + public class TransformedTextData : TextData + { + public string NormalizedText { get; set; } + } + } +} diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index e2baf1578e..605b465af1 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -57,6 +57,13 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text /// Name of the column to transform. If set to , the value of the will be used as source. /// Whether to prepend a marker character, , to the beginning, /// and append another marker character, , to the end of the output vector of characters. + /// + /// + /// + /// + /// public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null, @@ -93,6 +100,13 @@ internal static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(thi /// Whether to keep diacritical marks or remove them. /// Whether to keep punctuation marks or remove them. /// Whether to keep numbers or remove them. 
+ /// + /// + /// + /// + /// public static TextNormalizingEstimator NormalizeText(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null, From f4be9702597e4634a72096b9bb2a6713f40cfedf Mon Sep 17 00:00:00 2001 From: Zeeshan Ahmed Date: Thu, 28 Mar 2019 14:53:36 -0700 Subject: [PATCH 2/2] Some renaming and typos removed. --- .../Dynamic/Transforms/Text/NormalizeText.cs | 6 +++--- src/Microsoft.ML.Transforms/Text/TextCatalog.cs | 7 ------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs index 7f18469f41..920ea4353c 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/NormalizeText.cs @@ -15,10 +15,10 @@ public static void Example() // Create an empty data sample list. The 'NormalizeText' API does not require training data as // the estimator ('TextNormalizingEstimator') created by 'NormalizeText' API is not a trainable estimator. // The empty list is only needed to pass input schema to the pipeline. - var samples = new List<TextData>(); + var emptySamples = new List<TextData>(); // Convert sample list to an empty IDataView. - var dataview = mlContext.Data.LoadFromEnumerable(samples); + var emptyDataView = mlContext.Data.LoadFromEnumerable(emptySamples); // A pipeline for normalizing text. var normTextPipeline = mlContext.Transforms.Text.NormalizeText("NormalizedText", "Text", @@ -28,7 +28,7 @@ public static void Example() keepNumbers: false); // Fit to data. - var normTextTransformer = normTextPipeline.Fit(dataview); + var normTextTransformer = normTextPipeline.Fit(emptyDataView); // Create the prediction engine to get the normalized text from the input text/string. 
 var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(normTextTransformer); diff --git a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs index 605b465af1..391b8d4f91 100644 --- a/src/Microsoft.ML.Transforms/Text/TextCatalog.cs +++ b/src/Microsoft.ML.Transforms/Text/TextCatalog.cs @@ -57,6 +57,13 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text /// Name of the column to transform. If set to , the value of the will be used as source. /// Whether to prepend a marker character, , to the beginning, /// and append another marker character, , to the end of the output vector of characters. - /// - /// - /// - /// - /// public static TokenizingByCharactersEstimator TokenizeIntoCharactersAsKeys(this TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = null,