-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Created samples for 'ProduceWordBags' and 'ProduceHashedWordBags' API. #3183
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
9471b5d
Created samples for 'ProduceWordBags' and 'ProduceHashedWordBags' API.
zeahmed 1b2daca
Updated comments!
zeahmed 202f051
Addressed reviewers' comments.
zeahmed d288792
Addressed reviewers' comments.
zeahmed bdca2a5
Changed input/output classes to private.
zeahmed bc60f09
Addressed reviewers' comments.
zeahmed File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
89 changes: 89 additions & 0 deletions
89
docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceHashedWordBags.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Text; | ||
using Microsoft.ML.Data; | ||
using Microsoft.ML.Transforms.Text; | ||
|
||
namespace Microsoft.ML.Samples.Dynamic | ||
{ | ||
/// <summary>
/// Sample that converts free text into hashed bag-of-word (n-gram count) features
/// using the <c>ProduceHashedWordBags</c> text transform, then prints the size of the
/// feature vector, the n-grams recorded for each row, and the first few feature values.
/// </summary>
public static class ProduceHashedWordBags
{
    /// <summary>
    /// Runs the sample end to end and writes the results to the console.
    /// </summary>
    public static void Example()
    {
        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var mlContext = new MLContext();

        // Create a small dataset as an IEnumerable.
        var samples = new List<TextData>()
        {
            new TextData(){ Text = "This is an example to compute bag-of-word features using hashing." },
            new TextData(){ Text = "ML.NET's ProduceHashedWordBags API produces count of n-grams and hashes it as an index into a vector of given bit length." },
            new TextData(){ Text = "It does so by first tokenizing text/string into words/tokens then " },
            new TextData(){ Text = "computing n-grams and hash them to the index given by hash value." },
            new TextData(){ Text = "The hashing reduces the size of the output feature vector" },
            new TextData(){ Text = "which is useful in case when number of n-grams is very large." },
        };

        // Convert training data to IDataView.
        var dataview = mlContext.Data.LoadFromEnumerable(samples);

        // A pipeline for converting text into numeric bag-of-word features using hashing.
        // The following call to 'ProduceHashedWordBags' implicitly tokenizes the text/string into words/tokens.
        // Please note that the length of the output feature vector depends on the 'numberOfBits' settings
        // (with 'numberOfBits: 5' the expected output below reports 32 features).
        // NOTE(review): 'maximumNumberOfInverts: 1' is presumably what makes the slot names printed further
        // down available - confirm against the API documentation before changing it.
        var textPipeline = mlContext.Transforms.Text.ProduceHashedWordBags("BagOfWordFeatures", "Text",
            numberOfBits: 5,
            ngramLength: 3,
            useAllLengths: false,
            maximumNumberOfInverts: 1);

        // Fit to data.
        var textTransformer = textPipeline.Fit(dataview);
        var transformedDataView = textTransformer.Transform(dataview);

        // Create the prediction engine to get the bag-of-word features extracted from the text.
        var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

        // Convert the text into numeric features.
        var prediction = predictionEngine.Predict(samples[0]);

        // Print the length of the feature vector.
        Console.WriteLine($"Number of Features: {prediction.BagOfWordFeatures.Length}");

        // Preview of the produced n-grams.
        // Get the slot names from the column's metadata.
        // The slot names for a vector column correspond to the names associated with each position in the vector.
        VBuffer<ReadOnlyMemory<char>> slotNames = default;
        transformedDataView.Schema["BagOfWordFeatures"].GetSlotNames(ref slotNames);
        var bagOfWordFeaturesColumn = transformedDataView.GetColumn<VBuffer<float>>(transformedDataView.Schema["BagOfWordFeatures"]);
        var slots = slotNames.GetValues();
        Console.Write("N-grams: ");
        foreach (var featureRow in bagOfWordFeaturesColumn)
        {
            // Print the slot name of every entry enumerated for this row, one row per console line.
            foreach (var item in featureRow.Items())
            {
                Console.Write($"{slots[item.Key]} ");
            }

            Console.WriteLine();
        }

        // Print the first 10 feature values. The bound is clamped to the vector length so the sample
        // does not throw if 'numberOfBits' is lowered to a value yielding fewer than 10 slots.
        int featuresToPrint = Math.Min(10, prediction.BagOfWordFeatures.Length);
        Console.Write("Features: ");
        for (int i = 0; i < featuresToPrint; i++)
        {
            Console.Write($"{prediction.BagOfWordFeatures[i]:F4} ");
        }

        //  Expected output:
        //   Number of Features: 32
        //   N-grams:   an|example|to  is|an|example  example|to|compute  This|is|an  compute|bag-of-word|features  bag-of-word|features|using  to|compute|bag-of-word  ML.NET's|ProduceHashedWordBags|API  as|an|index  API|produces|count ...
        //   Features:  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  1.0000  2.0000  0.0000  0.0000 ...
    }

    /// <summary>
    /// Input row: a single piece of raw text.
    /// </summary>
    private class TextData
    {
        public string Text { get; set; }
    }

    /// <summary>
    /// Output row: the input text plus the bag-of-word feature vector read from the
    /// "BagOfWordFeatures" column produced by the pipeline.
    /// </summary>
    private class TransformedTextData : TextData
    {
        public float[] BagOfWordFeatures { get; set; }
    }
}
} |
89 changes: 89 additions & 0 deletions
89
docs/samples/Microsoft.ML.Samples/Dynamic/Transforms/Text/ProduceWordBags.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Text; | ||
using Microsoft.ML.Data; | ||
using Microsoft.ML.Transforms.Text; | ||
|
||
namespace Microsoft.ML.Samples.Dynamic | ||
{ | ||
/// <summary>
/// Sample that converts free text into bag-of-word (n-gram count) features using the
/// <c>ProduceWordBags</c> text transform, then prints the size of the feature vector,
/// the n-grams recorded for each row, and the first few feature values.
/// </summary>
public static class ProduceWordBags
{
    /// <summary>
    /// Runs the sample end to end and writes the results to the console.
    /// </summary>
    public static void Example()
    {
        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var mlContext = new MLContext();

        // Create a small dataset as an IEnumerable.
        // The sentences double as the description of the transform, so spelling matters: every word
        // ends up inside the n-gram slot names printed below.
        var samples = new List<TextData>()
        {
            new TextData(){ Text = "This is an example to compute bag-of-word features." },
            new TextData(){ Text = "ML.NET's ProduceWordBags API produces bag-of-word features from input text." },
            new TextData(){ Text = "It does so by first tokenizing text/string into words/tokens then " },
            new TextData(){ Text = "computing n-grams and their numeric values." },
            new TextData(){ Text = "Each position in the output vector corresponds to a particular n-gram." },
            new TextData(){ Text = "The value at each position corresponds to," },
            new TextData(){ Text = "the number of times n-gram occurred in the data (Tf), or" },
            new TextData(){ Text = "the inverse of the number of documents contain the n-gram (Idf)," },
            new TextData(){ Text = "or compute both and multiply together (Tf-Idf)." },
        };

        // Convert training data to IDataView.
        var dataview = mlContext.Data.LoadFromEnumerable(samples);

        // A pipeline for converting text into numeric bag-of-word features.
        // The following call to 'ProduceWordBags' implicitly tokenizes the text/string into words/tokens.
        // Please note that the length of the output feature vector depends on the n-gram settings.
        var textPipeline = mlContext.Transforms.Text.ProduceWordBags("BagOfWordFeatures", "Text",
            ngramLength: 3,
            useAllLengths: false,
            weighting: NgramExtractingEstimator.WeightingCriteria.Tf);

        // Fit to data.
        var textTransformer = textPipeline.Fit(dataview);
        var transformedDataView = textTransformer.Transform(dataview);

        // Create the prediction engine to get the bag-of-word features extracted from the text.
        var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

        // Convert the text into numeric features.
        var prediction = predictionEngine.Predict(samples[0]);

        // Print the length of the feature vector.
        Console.WriteLine($"Number of Features: {prediction.BagOfWordFeatures.Length}");

        // Preview of the produced n-grams.
        // Get the slot names from the column's metadata.
        // The slot names for a vector column correspond to the names associated with each position in the vector.
        VBuffer<ReadOnlyMemory<char>> slotNames = default;
        transformedDataView.Schema["BagOfWordFeatures"].GetSlotNames(ref slotNames);
        var bagOfWordFeaturesColumn = transformedDataView.GetColumn<VBuffer<float>>(transformedDataView.Schema["BagOfWordFeatures"]);
        var slots = slotNames.GetValues();
        Console.Write("N-grams: ");
        foreach (var featureRow in bagOfWordFeaturesColumn)
        {
            // Print the slot name of every entry enumerated for this row, one row per console line.
            foreach (var item in featureRow.Items())
            {
                Console.Write($"{slots[item.Key]} ");
            }

            Console.WriteLine();
        }

        // Print the first 10 feature values. The bound is clamped to the vector length so the sample
        // does not throw if the data or n-gram settings are changed to yield fewer than 10 slots.
        int featuresToPrint = Math.Min(10, prediction.BagOfWordFeatures.Length);
        Console.Write("Features: ");
        for (int i = 0; i < featuresToPrint; i++)
        {
            Console.Write($"{prediction.BagOfWordFeatures[i]:F4} ");
        }

        //  Expected output:
        //   Number of Features: 62
        //   N-grams:   This|is|an  is|an|example  an|example|to  example|to|compute  to|compute|bag-of-word  compute|bag-of-word|features.  ML.NET's|ProduceWordBags|API  ProduceWordBags|API|produces  API|produces|bag-of-word  produces|bag-of-word|features ...
        //   Features:  1.0000  1.0000  1.0000  1.0000  1.0000  1.0000  0.0000  0.0000  0.0000  0.0000 ...
    }

    /// <summary>
    /// Input row: a single piece of raw text.
    /// </summary>
    private class TextData
    {
        public string Text { get; set; }
    }

    /// <summary>
    /// Output row: the input text plus the bag-of-word feature vector read from the
    /// "BagOfWordFeatures" column produced by the pipeline.
    /// </summary>
    private class TransformedTextData : TextData
    {
        public float[] BagOfWordFeatures { get; set; }
    }
}
} |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why `predictionEngine` rather than `TakeRows` and `ConvertToEnumerable`? I would use the one we would recommend people to use to inspect data in practice.