Skip to content

Commit 233bc2d

Browse files
authored
Created samples for 'FeaturizeText' API. (#3120)
1 parent 70c6418 commit 233bc2d

File tree

3 files changed

+162
-1
lines changed

3 files changed

+162
-1
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using Microsoft.ML.Data;
4+
5+
namespace Microsoft.ML.Samples.Dynamic
6+
{
7+
/// <summary>
/// Sample that converts a text column into a numeric feature vector by calling
/// 'FeaturizeText' with its default settings, then prints the vector length and
/// its leading values.
/// </summary>
public static class FeaturizeText
{
    /// <summary>
    /// Builds a small in-memory text dataset, fits the text featurizing pipeline on it,
    /// featurizes the first sample and writes the result to the console.
    /// </summary>
    public static void Example()
    {
        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var mlContext = new MLContext();

        // Create a small dataset as an IEnumerable.
        var samples = new List<TextData>()
        {
            new TextData(){ Text = "ML.NET's FeaturizeText API uses a composition of several basic transforms to convert text into numeric features." },
            new TextData(){ Text = "This API can be used as a featurizer to perform text classification." },
            new TextData(){ Text = "There are a number of approaches to text classification." },
            new TextData(){ Text = "One of the simplest and most common approaches is called “Bag of Words”." },
            new TextData(){ Text = "Text classification can be used for a wide variety of tasks" },
            new TextData(){ Text = "such as sentiment analysis, topic detection, intent identification etc." },
        };

        // Convert training data to IDataView.
        var dataview = mlContext.Data.LoadFromEnumerable(samples);

        // A pipeline for converting text into numeric features.
        // The following call to 'FeaturizeText' instantiates 'TextFeaturizingEstimator' with default parameters.
        // The default settings for the TextFeaturizingEstimator are
        //      * StopWordsRemover: None
        //      * CaseMode: Lowercase
        //      * OutputTokensColumnName: None
        //      * KeepDiacritics: false, KeepPunctuations: true, KeepNumbers: true
        //      * WordFeatureExtractor: NgramLength = 1
        //      * CharFeatureExtractor: NgramLength = 3, UseAllLengths = false
        // The length of the output feature vector depends on these settings.
        var textPipeline = mlContext.Transforms.Text.FeaturizeText("Features", "Text");

        // Fit to data.
        var textTransformer = textPipeline.Fit(dataview);

        // Create the prediction engine to get the features extracted from the text.
        var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

        // Convert the text into numeric features.
        var prediction = predictionEngine.Predict(samples[0]);

        // Print the length of the feature vector.
        Console.WriteLine($"Number of Features: {prediction.Features.Length}");

        // Print the first 10 feature values. The bound is clamped to the vector length so that
        // a featurizer configuration producing fewer than 10 features does not index out of range.
        Console.Write("Features: ");
        int featuresToPrint = Math.Min(10, prediction.Features.Length);
        for (int i = 0; i < featuresToPrint; i++)
        {
            Console.Write($"{prediction.Features[i]:F4} ");
        }

        // Expected output (the trailing "..." stands for the remaining values, which are not printed):
        //   Number of Features: 332
        //   Features: 0.0857 0.0857 0.0857 0.0857 0.0857 0.0857 0.0857 0.0857 0.0857 0.1715 ...
    }

    /// <summary>
    /// Input row: a single piece of raw text.
    /// </summary>
    public class TextData
    {
        public string Text { get; set; }
    }

    /// <summary>
    /// Output row: the input text plus the "Features" column written by the pipeline.
    /// </summary>
    public class TransformedTextData : TextData
    {
        public float[] Features { get; set; }
    }
}
73+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using Microsoft.ML.Data;
4+
using Microsoft.ML.Transforms.Text;
5+
6+
namespace Microsoft.ML.Samples.Dynamic
7+
{
8+
/// <summary>
/// Sample that converts a text column into a numeric feature vector by calling
/// 'FeaturizeText' with explicit <c>TextFeaturizingEstimator.Options</c>, then prints
/// the vector length, its leading values and the tokens emitted for the sample.
/// </summary>
public static class FeaturizeTextWithOptions
{
    /// <summary>
    /// Builds a small in-memory text dataset, fits the configured text featurizing pipeline on it,
    /// featurizes the first sample and writes the features and tokens to the console.
    /// </summary>
    public static void Example()
    {
        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var mlContext = new MLContext();

        // Create a small dataset as an IEnumerable.
        var samples = new List<TextData>()
        {
            new TextData(){ Text = "ML.NET's FeaturizeText API uses a composition of several basic transforms to convert text into numeric features." },
            new TextData(){ Text = "This API can be used as a featurizer to perform text classification." },
            new TextData(){ Text = "There are a number of approaches to text classification." },
            new TextData(){ Text = "One of the simplest and most common approaches is called “Bag of Words”." },
            new TextData(){ Text = "Text classification can be used for a wide variety of tasks" },
            new TextData(){ Text = "such as sentiment analysis, topic detection, intent identification etc." },
        };

        // Convert training data to IDataView.
        var dataview = mlContext.Data.LoadFromEnumerable(samples);

        // A pipeline for converting text into numeric features.
        // The following call to 'FeaturizeText' instantiates 'TextFeaturizingEstimator' with given parameters.
        // The length of the output feature vector depends on these settings.
        var options = new TextFeaturizingEstimator.Options()
        {
            // Also output tokenized words
            OutputTokensColumnName = "OutputTokens",
            CaseMode = TextNormalizingEstimator.CaseMode.Lower,
            // Use ML.NET's built-in stop word remover
            StopWordsRemoverOptions = new StopWordsRemovingEstimator.Options() { Language = TextFeaturizingEstimator.Language.English },
            WordFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 2, UseAllLengths = true },
            CharFeatureExtractor = new WordBagEstimator.Options() { NgramLength = 3, UseAllLengths = false },
        };
        var textPipeline = mlContext.Transforms.Text.FeaturizeText("Features", options, "Text");

        // Fit to data.
        var textTransformer = textPipeline.Fit(dataview);

        // Create the prediction engine to get the features extracted from the text.
        var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

        // Convert the text into numeric features.
        var prediction = predictionEngine.Predict(samples[0]);

        // Print the length of the feature vector.
        Console.WriteLine($"Number of Features: {prediction.Features.Length}");

        // Print the first 10 feature values. The bound is clamped to the vector length so that
        // an options combination producing fewer than 10 features does not index out of range.
        Console.Write("Features: ");
        int featuresToPrint = Math.Min(10, prediction.Features.Length);
        for (int i = 0; i < featuresToPrint; i++)
        {
            Console.Write($"{prediction.Features[i]:F4} ");
        }

        // Print the tokens on their own line; the leading "\n" ends the "Features: " line above.
        Console.WriteLine($"\nTokens: {string.Join(",", prediction.OutputTokens)}");

        // Expected output (the trailing "..." stands for the remaining values, which are not printed):
        //   Number of Features: 282
        //   Features: 0.0941 0.0941 0.0941 0.0941 0.0941 0.0941 0.0941 0.0941 0.0941 0.1881 ...
        //   Tokens: ml.net's,featurizetext,api,uses,composition,basic,transforms,convert,text,numeric,features.
    }

    /// <summary>
    /// Input row: a single piece of raw text.
    /// </summary>
    public class TextData
    {
        public string Text { get; set; }
    }

    /// <summary>
    /// Output row: the input text plus the "Features" column and the "OutputTokens" column
    /// requested through <c>OutputTokensColumnName</c>.
    /// </summary>
    public class TransformedTextData : TextData
    {
        public float[] Features { get; set; }
        public string[] OutputTokens { get; set; }
    }
}
81+
}

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

+8-1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,13 @@ public static class TextCatalog
2222
/// <param name="catalog">The text-related transform's catalog.</param>
2323
/// <param name="outputColumnName">Name of the column resulting from the transformation of <paramref name="inputColumnName"/>.</param>
2424
/// <param name="inputColumnName">Name of the column to transform. If set to <see langword="null"/>, the value of the <paramref name="outputColumnName"/> will be used as source.</param>
25+
/// <example>
26+
/// <format type="text/markdown">
27+
/// <![CDATA[
28+
/// [!code-csharp[FeaturizeText](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/FeaturizeText.cs)]
29+
/// ]]>
30+
/// </format>
31+
/// </example>
2532
public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.TextTransforms catalog,
2633
string outputColumnName,
2734
string inputColumnName = null)
@@ -38,7 +45,7 @@ public static TextFeaturizingEstimator FeaturizeText(this TransformsCatalog.Text
3845
/// <example>
3946
/// <format type="text/markdown">
4047
/// <![CDATA[
41-
/// [!code-csharp[FeaturizeText](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/TextTransform.cs)]
48+
/// [!code-csharp[FeaturizeText](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/FeaturizeTextWithOptions.cs)]
4249
/// ]]>
4350
/// </format>
4451
/// </example>

0 commit comments

Comments
 (0)