Skip to content

Commit 24645ff

Browse files
authored
Created samples for 'ProduceWordBags' and 'ProduceHashedWordBags' API. (#3183)
1 parent e54f295 commit 24645ff

File tree

2 files changed

+178
-0
lines changed

2 files changed

+178
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Text;
4+
using Microsoft.ML.Data;
5+
using Microsoft.ML.Transforms.Text;
6+
7+
namespace Microsoft.ML.Samples.Dynamic
8+
{
9+
/// <summary>
/// Sample for the 'ProduceHashedWordBags' API: converts text into a bag-of-word
/// feature vector in which n-grams are hashed into slots.
/// </summary>
public static class ProduceHashedWordBags
{
    /// <summary>
    /// Builds a small in-memory text dataset, fits a 'ProduceHashedWordBags' pipeline on it,
    /// and prints the feature vector length, the n-grams recorded as slot names, and the
    /// leading feature values of the first sample.
    /// </summary>
    public static void Example()
    {
        // Upper bound on how many leading feature values are printed.
        const int featurePreviewCount = 10;

        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var mlContext = new MLContext();

        // Create a small dataset as an IEnumerable.
        var samples = new List<TextData>()
        {
            new TextData(){ Text = "This is an example to compute bag-of-word features using hashing." },
            new TextData(){ Text = "ML.NET's ProduceHashedWordBags API produces count of n-grams and hashes it as an index into a vector of given bit length." },
            new TextData(){ Text = "It does so by first tokenizing text/string into words/tokens then " },
            new TextData(){ Text = "computing n-grams and hash them to the index given by hash value." },
            new TextData(){ Text = "The hashing reduces the size of the output feature vector" },
            new TextData(){ Text = "which is useful in case when number of n-grams is very large." },
        };

        // Convert the sample data to an IDataView.
        var dataview = mlContext.Data.LoadFromEnumerable(samples);

        // A pipeline for converting text into numeric bag-of-word features using hashing.
        // The following call to 'ProduceHashedWordBags' implicitly tokenizes the text/string into words/tokens.
        // Please note that the length of the output feature vector depends on the 'numberOfBits' setting
        // (5 bits here, matching the 32 features reported in the expected output below).
        // 'maximumNumberOfInverts: 1' keeps a hash-to-n-gram mapping so that the slot names read
        // further down are available.
        var textPipeline = mlContext.Transforms.Text.ProduceHashedWordBags("BagOfWordFeatures", "Text",
            numberOfBits: 5,
            ngramLength: 3,
            useAllLengths: false,
            maximumNumberOfInverts: 1);

        // Fit to data.
        var textTransformer = textPipeline.Fit(dataview);
        var transformedDataView = textTransformer.Transform(dataview);

        // Create the prediction engine to get the bag-of-word features extracted from the text.
        var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

        // Convert the text into numeric features.
        var prediction = predictionEngine.Predict(samples[0]);

        // Print the length of the feature vector.
        Console.WriteLine($"Number of Features: {prediction.BagOfWordFeatures.Length}");

        // Preview of the produced n-grams.
        // Get the slot names from the column's metadata.
        // The slot names of a vector column are the names associated with each position in the vector.
        VBuffer<ReadOnlyMemory<char>> slotNames = default;
        transformedDataView.Schema["BagOfWordFeatures"].GetSlotNames(ref slotNames);
        var bagOfWordFeaturesColumn = transformedDataView.GetColumn<VBuffer<float>>(transformedDataView.Schema["BagOfWordFeatures"]);
        var slots = slotNames.GetValues();
        Console.Write("N-grams: ");
        foreach (var featureRow in bagOfWordFeaturesColumn)
        {
            // One output line per input row, listing the slot name of every item the row yields.
            foreach (var item in featureRow.Items())
            {
                Console.Write($"{slots[item.Key]} ");
            }

            Console.WriteLine();
        }

        // Print the first feature values, never reading past the end of the vector.
        Console.Write("Features: ");
        int shownFeatures = Math.Min(featurePreviewCount, prediction.BagOfWordFeatures.Length);
        for (int i = 0; i < shownFeatures; i++)
        {
            Console.Write($"{prediction.BagOfWordFeatures[i]:F4} ");
        }

        //  Expected output:
        //   Number of Features: 32
        //   N-grams:   an|example|to  is|an|example  example|to|compute  This|is|an  compute|bag-of-word|features  bag-of-word|features|using  to|compute|bag-of-word  ML.NET's|ProduceHashedWordBags|API  as|an|index  API|produces|count ...
        //   Features:    0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  1.0000  2.0000  0.0000  0.0000 ...
    }

    /// <summary>Input row: a single piece of raw text.</summary>
    private class TextData
    {
        public string Text { get; set; }
    }

    /// <summary>Output row: the input text plus the feature vector written to the 'BagOfWordFeatures' column.</summary>
    private class TransformedTextData : TextData
    {
        public float[] BagOfWordFeatures { get; set; }
    }
}
89+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
using System;
2+
using System.Collections.Generic;
3+
using System.Text;
4+
using Microsoft.ML.Data;
5+
using Microsoft.ML.Transforms.Text;
6+
7+
namespace Microsoft.ML.Samples.Dynamic
8+
{
9+
/// <summary>
/// Sample for the 'ProduceWordBags' API: converts text into a bag-of-word
/// feature vector with one slot per n-gram.
/// </summary>
public static class ProduceWordBags
{
    /// <summary>
    /// Builds a small in-memory text dataset, fits a 'ProduceWordBags' pipeline on it,
    /// and prints the feature vector length, the n-grams recorded as slot names, and the
    /// leading feature values of the first sample.
    /// </summary>
    public static void Example()
    {
        // Upper bound on how many leading feature values are printed.
        const int featurePreviewCount = 10;

        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var mlContext = new MLContext();

        // Create a small dataset as an IEnumerable.
        var samples = new List<TextData>()
        {
            new TextData(){ Text = "This is an example to compute bag-of-word features." },
            new TextData(){ Text = "ML.NET's ProduceWordBags API produces bag-of-word features from input text." },
            new TextData(){ Text = "It does so by first tokenizing text/string into words/tokens then " },
            new TextData(){ Text = "computing n-grams and their numeric values." },
            new TextData(){ Text = "Each position in the output vector corresponds to a particular n-gram." },
            new TextData(){ Text = "The value at each position corresponds to," },
            new TextData(){ Text = "the number of times n-gram occurred in the data (Tf), or" },
            new TextData(){ Text = "the inverse of the number of documents contain the n-gram (Idf)," },
            new TextData(){ Text = "or compute both and multiply together (Tf-Idf)." },
        };

        // Convert the sample data to an IDataView.
        var dataview = mlContext.Data.LoadFromEnumerable(samples);

        // A pipeline for converting text into numeric bag-of-word features.
        // The following call to 'ProduceWordBags' implicitly tokenizes the text/string into words/tokens.
        // Please note that the length of the output feature vector depends on the n-gram settings.
        var textPipeline = mlContext.Transforms.Text.ProduceWordBags("BagOfWordFeatures", "Text",
            ngramLength: 3,
            useAllLengths: false,
            weighting: NgramExtractingEstimator.WeightingCriteria.Tf);

        // Fit to data.
        var textTransformer = textPipeline.Fit(dataview);
        var transformedDataView = textTransformer.Transform(dataview);

        // Create the prediction engine to get the bag-of-word features extracted from the text.
        var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

        // Convert the text into numeric features.
        var prediction = predictionEngine.Predict(samples[0]);

        // Print the length of the feature vector.
        Console.WriteLine($"Number of Features: {prediction.BagOfWordFeatures.Length}");

        // Preview of the produced n-grams.
        // Get the slot names from the column's metadata.
        // The slot names of a vector column are the names associated with each position in the vector.
        VBuffer<ReadOnlyMemory<char>> slotNames = default;
        transformedDataView.Schema["BagOfWordFeatures"].GetSlotNames(ref slotNames);
        var bagOfWordFeaturesColumn = transformedDataView.GetColumn<VBuffer<float>>(transformedDataView.Schema["BagOfWordFeatures"]);
        var slots = slotNames.GetValues();
        Console.Write("N-grams: ");
        foreach (var featureRow in bagOfWordFeaturesColumn)
        {
            // One output line per input row, listing the slot name of every item the row yields.
            foreach (var item in featureRow.Items())
            {
                Console.Write($"{slots[item.Key]} ");
            }

            Console.WriteLine();
        }

        // Print the first feature values, never reading past the end of the vector
        // (its length depends on the n-grams found in the data).
        Console.Write("Features: ");
        int shownFeatures = Math.Min(featurePreviewCount, prediction.BagOfWordFeatures.Length);
        for (int i = 0; i < shownFeatures; i++)
        {
            Console.Write($"{prediction.BagOfWordFeatures[i]:F4} ");
        }

        //  Expected output:
        //   Number of Features: 62
        //   N-grams:   This|is|an  is|an|example  an|example|to  example|to|compute  to|compute|bag-of-word  compute|bag-of-word|features.  ML.NET's|ProduceWordBags|API  ProduceWordBags|API|produces  API|produces|bag-of-word  produces|bag-of-word|features ...
        //   Features:    1.0000  1.0000  1.0000  1.0000  1.0000  1.0000  0.0000  0.0000  0.0000  0.0000 ...
    }

    /// <summary>Input row: a single piece of raw text.</summary>
    private class TextData
    {
        public string Text { get; set; }
    }

    /// <summary>Output row: the input text plus the feature vector written to the 'BagOfWordFeatures' column.</summary>
    private class TransformedTextData : TextData
    {
        public float[] BagOfWordFeatures { get; set; }
    }
}
89+
}

0 commit comments

Comments
 (0)