Skip to content

Add LDA example to Microsoft.ML.Samples #1782

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 29, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
using Microsoft.ML.Data;
using Microsoft.ML.Runtime.Api;
using Microsoft.ML.Runtime.Data;
using System;
using System.Collections.Generic;

namespace Microsoft.ML.Samples.Dynamic
{
    /// <summary>
    /// Sample showing how to featurize a text column with the
    /// LatentDirichletAllocation transform and print the resulting vectors.
    /// </summary>
    public class LdaTransformExample
    {
        /// <summary>
        /// Builds a word-bag + LDA pipeline over the sample topics data, fits it,
        /// and writes every row of the produced "LdaFeatures" column to the console.
        /// </summary>
        public static void LdaTransform()
        {
            // The MLContext is the entry point for all ML.NET operations used below.
            var mlContext = new MLContext();

            // Load the small in-memory sample dataset and wrap it in a data view.
            IEnumerable<SamplesUtils.DatasetUtils.SampleTopicsData> samples = SamplesUtils.DatasetUtils.GetTopicsData();
            var dataView = mlContext.CreateStreamingDataView(samples);

            // Preview of one of the columns of the topics data.
            // The Review column contains the keys associated with a particular body of text.
            //
            // Review
            // "animals birds cats dogs fish horse"
            // "horse birds house fish duck cats"
            // "car truck driver bus pickup"
            // "car truck driver bus pickup horse"

            string inputColumn = nameof(SamplesUtils.DatasetUtils.SampleTopicsData.Review);
            string outputColumn = "LdaFeatures";

            // Featurization pipeline for the "Review" column: word bags first,
            // then LDA with three topics writing into the output column.
            var wordBags = mlContext.Transforms.Text.ProduceWordBags(inputColumn);
            var lda = mlContext.Transforms.Text.LatentDirichletAllocation(inputColumn, outputColumn, numTopic: 3);
            var pipeline = wordBags.Append(lda);

            // Fit the pipeline on the data and apply it to the same data.
            var transformedData = pipeline.Fit(dataView).Transform(dataView);

            // Pull out the column produced by the LDA step.
            var topicVectors = transformedData.GetColumn<VBuffer<float>>(mlContext, outputColumn);

            Console.WriteLine($"{outputColumn} column obtained post-transformation.");
            foreach (var topicVector in topicVectors)
            {
                PrintRow(topicVector);
            }

            Console.WriteLine("===================================================");

            // LdaFeatures column obtained post-transformation.
            // For LDA, numTopic:3 was specified, so each row of text is featurized
            // as a vector of floats with length 3. Sample output:
            //
            // 0.1818182 0.4545455 0.3636364
            // 0.3636364 0.1818182 0.4545455
            // 0.2222222 0.2222222 0.5555556
            // 0.2727273 0.09090909 0.6363636
        }

        // Writes the values of one feature vector on a single line, each value
        // followed by a space, then terminates the line.
        private static void PrintRow(VBuffer<float> row)
        {
            foreach (var component in row.GetValues())
            {
                Console.Write($"{component} ");
            }

            Console.WriteLine("");
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
<NativeAssemblyReference Include="CpuMathNative" />
<NativeAssemblyReference Include="FastTreeNative" />
<NativeAssemblyReference Include="MatrixFactorizationNative" />
<NativeAssemblyReference Include="LdaNative" />

<ProjectReference Include="..\..\..\src\Microsoft.ML.Analyzer\Microsoft.ML.Analyzer.csproj">
<ReferenceOutputAssembly>false</ReferenceOutputAssembly>
Expand Down
2 changes: 1 addition & 1 deletion docs/samples/Microsoft.ML.Samples/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ internal static class Program
{
static void Main(string[] args)
{
MatrixFactorizationExample.MatrixFactorizationInMemoryData();
LdaTransformExample.LdaTransform();
}
}
}
7 changes: 7 additions & 0 deletions src/Microsoft.ML.Transforms/Text/TextCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -507,6 +507,13 @@ public static NgramHashEstimator ProduceHashedNgrams(this TransformsCatalog.Text
/// <param name="numSummaryTermPerTopic">The number of words to summarize the topic.</param>
/// <param name="numBurninIterations">The number of burn-in iterations.</param>
/// <param name="resetRandomGenerator">Reset the random number generator for each document.</param>
/// <example>
/// <format type="text/markdown">
/// <![CDATA[
/// [!code-csharp[LatentDirichletAllocation](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs)]
/// ]]>
/// </format>
/// </example>
public static LatentDirichletAllocationEstimator LatentDirichletAllocation(this TransformsCatalog.TextTransforms catalog,
string inputColumn,
string outputColumn = null,
Expand Down