Skip to content

Commit cbd8ad3

Browse files
committed
Adding LDA sample to Microsoft.ML.Samples
1 parent 533e186 commit cbd8ad3

File tree

4 files changed

+75
-1
lines changed

4 files changed

+75
-1
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
using Microsoft.ML.Data;
2+
using Microsoft.ML.Runtime.Api;
3+
using Microsoft.ML.Runtime.Data;
4+
using System;
5+
using System.Collections.Generic;
6+
7+
namespace Microsoft.ML.Samples.Dynamic
8+
{
9+
/// <summary>
/// Sample showing how to featurize a text column with the
/// LatentDirichletAllocation transform.
/// </summary>
public class LdaTransformExample
{
    /// <summary>
    /// Builds a pipeline that produces word bags from the "Review" column and then applies
    /// LatentDirichletAllocation to it, fits and applies that pipeline to a small sample
    /// dataset, and writes the resulting "LdaFeatures" column to the console.
    /// </summary>
    public static void LdaTransform()
    {
        // Create a new ML context, for ML.NET operations. It can be used for exception tracking and logging,
        // as well as the source of randomness.
        var ml = new MLContext();

        // Get a small dataset as an IEnumerable.
        IEnumerable<SamplesUtils.DatasetUtils.SampleTopicsData> data = SamplesUtils.DatasetUtils.GetTopicsData();
        var trainData = ml.CreateStreamingDataView(data);

        // Preview of one of the columns of the topics data.
        // The Review column contains the keys associated with a particular body of text.
        //
        // Review
        // "animals birds cats dogs fish horse"
        // "horse birds house fish duck cats"
        // "car truck driver bus pickup"
        // "car truck driver bus pickup horse"

        // A pipeline for featurizing the "Review" column.
        // ProduceWordBags is given only "Review", and the LDA step then reads "Review"
        // and writes its result to the "LdaFeatures" column.
        string ldaFeatures = "LdaFeatures";
        var pipeline = ml.Transforms.Text.ProduceWordBags("Review")
            .Append(ml.Transforms.Text.LatentDirichletAllocation("Review", ldaFeatures, numTopic: 3));

        // Fit the pipeline on the sample data, then apply it to the same data.
        var transformer = pipeline.Fit(trainData);
        var transformedData = transformer.Transform(trainData);

        // Small helper to print the values inside a column to the console:
        // one output line per row, values separated by a single space.
        Action<string, IEnumerable<VBuffer<float>>> printHelper = (columnName, column) =>
        {
            Console.WriteLine($"{columnName} column obtained post-transformation.");
            foreach (var featureRow in column)
            {
                foreach (var value in featureRow.GetValues())
                {
                    Console.Write($"{value} ");
                }

                Console.WriteLine("");
            }

            Console.WriteLine("===================================================");
        };

        // Preview of the column obtained after processing the input.
        var defaultColumn = transformedData.GetColumn<VBuffer<float>>(ml, ldaFeatures);
        printHelper(ldaFeatures, defaultColumn);

        // LdaFeatures column obtained post-transformation.
        // For LDA, we had specified numTopic:3. Hence each row of text has been featurized as a vector of floats with length 3.

        // 0.1818182 0.4545455 0.3636364
        // 0.3636364 0.1818182 0.4545455
        // 0.2222222 0.2222222 0.5555556
        // 0.2727273 0.09090909 0.6363636
    }
}
66+
}

docs/samples/Microsoft.ML.Samples/Microsoft.ML.Samples.csproj

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
<NativeAssemblyReference Include="CpuMathNative" />
1818
<NativeAssemblyReference Include="FastTreeNative" />
1919
<NativeAssemblyReference Include="MatrixFactorizationNative" />
20+
<NativeAssemblyReference Include="LdaNative" />
2021

2122
<ProjectReference Include="..\..\..\src\Microsoft.ML.Analyzer\Microsoft.ML.Analyzer.csproj">
2223
<ReferenceOutputAssembly>false</ReferenceOutputAssembly>

docs/samples/Microsoft.ML.Samples/Program.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ internal static class Program
66
{
77
static void Main(string[] args)
88
{
9-
MatrixFactorizationExample.MatrixFactorizationInMemoryData();
9+
LdaTransformExample.LdaTransform();
1010
}
1111
}
1212
}

src/Microsoft.ML.Transforms/Text/TextCatalog.cs

+7
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,13 @@ public static NgramHashEstimator ProduceHashedNgrams(this TransformsCatalog.Text
507507
/// <param name="numSummaryTermPerTopic">The number of words to summarize the topic.</param>
508508
/// <param name="numBurninIterations">The number of burn-in iterations.</param>
509509
/// <param name="resetRandomGenerator">Reset the random number generator for each document.</param>
510+
/// <example>
511+
/// <format type="text/markdown">
512+
/// <![CDATA[
513+
/// [!code-csharp[LatentDirichletAllocation](~/../docs/samples/docs/samples/Microsoft.ML.Samples/Dynamic/LdaTransform.cs)]
514+
/// ]]>
515+
/// </format>
516+
/// </example>
510517
public static LatentDirichletAllocationEstimator LatentDirichletAllocation(this TransformsCatalog.TextTransforms catalog,
511518
string inputColumn,
512519
string outputColumn = null,

0 commit comments

Comments
 (0)