
Created samples for 'ProduceWordBags' and 'ProduceHashedWordBags' API. #3183

Merged: 6 commits, Apr 4, 2019
ProduceHashedWordBags.cs (new file)
@@ -0,0 +1,89 @@
using System;
using System.Collections.Generic;
using System.Text;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms.Text;

namespace Microsoft.ML.Samples.Dynamic
{
public static class ProduceHashedWordBags
{
public static void Example()
{
// Create a new ML context for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
var mlContext = new MLContext();

// Create a small dataset as an IEnumerable.
var samples = new List<TextData>()
{
new TextData(){ Text = "This is an example to compute bag-of-word features using hashing." },
new TextData(){ Text = "ML.NET's ProduceHashedWordBags API produces count of n-grams and hashes it as an index into a vector of given bit length." },
new TextData(){ Text = "It does so by first tokenizing text/string into words/tokens then " },
new TextData(){ Text = "computing n-grams and hash them to the index given by hash value." },
new TextData(){ Text = "The hashing reduces the size of the output feature vector" },
new TextData(){ Text = "which is useful in case when number of n-grams is very large." },
};

// Convert training data to IDataView.
var dataview = mlContext.Data.LoadFromEnumerable(samples);

// A pipeline for converting text into numeric bag-of-word features using hashing.
// The following call to 'ProduceHashedWordBags' implicitly tokenizes the text/string into words/tokens.
// Please note that the length of the output feature vector depends on the 'numberOfBits' setting: the vector has 2^numberOfBits slots, so numberOfBits: 5 below yields 32 slots.
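// 'maximumNumberOfInverts: 1' retains a mapping from hash values back to the original n-grams, which is what makes the slot names retrievable further below.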
var textPipeline = mlContext.Transforms.Text.ProduceHashedWordBags("BagOfWordFeatures", "Text",
numberOfBits: 5,
ngramLength: 3,
useAllLengths: false,
maximumNumberOfInverts: 1);

// Fit to data.
var textTransformer = textPipeline.Fit(dataview);
var transformedDataView = textTransformer.Transform(dataview);

// Create the prediction engine to get the bag-of-word features extracted from the text.
var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);

// Convert the text into numeric features.
var prediction = predictionEngine.Predict(samples[0]);

// Print the length of the feature vector.
Console.WriteLine($"Number of Features: {prediction.BagOfWordFeatures.Length}");

// Preview of the produced n-grams.
// Get the slot names from the column's metadata.
// The slot names for a vector column correspond to the names associated with each position in the vector.
VBuffer<ReadOnlyMemory<char>> slotNames = default;
transformedDataView.Schema["BagOfWordFeatures"].GetSlotNames(ref slotNames);
var BagOfWordFeaturesColumn = transformedDataView.GetColumn<VBuffer<float>>(transformedDataView.Schema["BagOfWordFeatures"]);
var slots = slotNames.GetValues();
Console.Write("N-grams: ");
foreach (var featureRow in BagOfWordFeaturesColumn)
{
foreach (var item in featureRow.Items())
Console.Write($"{slots[item.Key]} ");
Console.WriteLine();
}

// Print the first 10 feature values.
Console.Write("Features: ");
for (int i = 0; i < 10; i++)
Console.Write($"{prediction.BagOfWordFeatures[i]:F4} ");

// Expected output:
// Number of Features: 32
// N-grams: an|example|to is|an|example example|to|compute This|is|an compute|bag-of-word|features bag-of-word|features|using to|compute|bag-of-word ML.NET's|ProduceHashedWordBags|API as|an|index API|produces|count ...
// Features: 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 1.0000 2.0000 0.0000 0.0000 ...
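// Note: only 2^5 = 32 slots are available, so distinct n-grams can hash to the same slot; the 2.0000 above reflects two of the sentence's trigrams colliding in one slot.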
}

private class TextData
{
public string Text { get; set; }
}

private class TransformedTextData : TextData
{
public float[] BagOfWordFeatures { get; set; }
}
}
}
ProduceWordBags.cs (new file)
@@ -0,0 +1,89 @@
using System;
using System.Collections.Generic;
using System.Text;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms.Text;

namespace Microsoft.ML.Samples.Dynamic
{
public static class ProduceWordBags
{
public static void Example()
{
// Create a new ML context for ML.NET operations. It can be used for exception tracking and logging,
// as well as the source of randomness.
var mlContext = new MLContext();

// Create a small dataset as an IEnumerable.
var samples = new List<TextData>()
{
new TextData(){ Text = "This is an example to compute bag-of-word features." },
new TextData(){ Text = "ML.NET's ProduceWordBags API produces bag-of-word features from input text." },
new TextData(){ Text = "It does so by first tokenizing text/string into words/tokens then " },
new TextData(){ Text = "computing n-grams and their neumeric values." },
new TextData(){ Text = "Each position in the output vector corresponds to a particular n-gram." },
new TextData(){ Text = "The value at each position corresponds to," },
new TextData(){ Text = "the number of times n-gram occured in the data (Tf), or" },
new TextData(){ Text = "the inverse of the number of documents contain the n-gram (Idf)," },
new TextData(){ Text = "or compute both and multipy together (Tf-Idf)." },
};

// Convert training data to IDataView.
var dataview = mlContext.Data.LoadFromEnumerable(samples);

// A pipeline for converting text into numeric bag-of-word features.
// The following call to 'ProduceWordBags' implicitly tokenizes the text/string into words/tokens.
// Please note that the length of the output feature vector depends on the n-gram settings: each distinct n-gram found during fitting gets its own slot.
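// With 'WeightingCriteria.Tf', the value stored in each slot is the raw count of that n-gram in the row's text.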
var textPipeline = mlContext.Transforms.Text.ProduceWordBags("BagOfWordFeatures", "Text",
ngramLength: 3, useAllLengths: false, weighting: NgramExtractingEstimator.WeightingCriteria.Tf);

// Fit to data.
var textTransformer = textPipeline.Fit(dataview);
var transformedDataView = textTransformer.Transform(dataview);

// Create the prediction engine to get the bag-of-word features extracted from the text.
var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData, TransformedTextData>(textTransformer);
Contributor (review comment):
Why predictionEngine rather than TakeRows and ConvertToEnumerable? I would use the one we would recommend people to use to inspect data in practice.
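A minimal sketch of the reviewer's suggestion, not part of the committed diff: inspect the transformed IDataView with TakeRows and CreateEnumerable (the DataOperationsCatalog method the comment presumably refers to as ConvertToEnumerable) instead of a prediction engine, reusing the mlContext, transformedDataView, and TransformedTextData already defined in this sample.

// Take a few rows from the transformed data and materialize them as an IEnumerable for inspection.
var previewRows = mlContext.Data.TakeRows(transformedDataView, 3);
var rows = mlContext.Data.CreateEnumerable<TransformedTextData>(previewRows, reuseRowObject: false);
foreach (var row in rows)
    Console.WriteLine($"Number of Features: {row.BagOfWordFeatures.Length}");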


// Convert the text into numeric features.
var prediction = predictionEngine.Predict(samples[0]);

// Print the length of the feature vector.
Console.WriteLine($"Number of Features: {prediction.BagOfWordFeatures.Length}");

// Preview of the produced n-grams.
// Get the slot names from the column's metadata.
// The slot names for a vector column correspond to the names associated with each position in the vector.
VBuffer<ReadOnlyMemory<char>> slotNames = default;
transformedDataView.Schema["BagOfWordFeatures"].GetSlotNames(ref slotNames);
var BagOfWordFeaturesColumn = transformedDataView.GetColumn<VBuffer<float>>(transformedDataView.Schema["BagOfWordFeatures"]);
var slots = slotNames.GetValues();
Console.Write("N-grams: ");
foreach (var featureRow in BagOfWordFeaturesColumn)
{
foreach (var item in featureRow.Items())
Console.Write($"{slots[item.Key]} ");
Console.WriteLine();
}

// Print the first 10 feature values.
Console.Write("Features: ");
for (int i = 0; i < 10; i++)
Console.Write($"{prediction.BagOfWordFeatures[i]:F4} ");

// Expected output:
// Number of Features: 62
// N-grams: This|is|an is|an|example an|example|to example|to|compute to|compute|bag-of-word compute|bag-of-word|features. ML.NET's|ProduceWordBags|API ProduceWordBags|API|produces API|produces|bag-of-word produces|bag-of-word|features ...
// Features: 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 ...
}

private class TextData
{
public string Text { get; set; }
}

private class TransformedTextData : TextData
{
public float[] BagOfWordFeatures { get; set; }
}
}
}