Skip to content

Add examples for clustering #222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
May 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions test/Microsoft.ML.Tests/Microsoft.ML.Tests.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
</PropertyGroup>

<ItemGroup>
<ProjectReference Include="..\..\src\Microsoft.ML.KMeansClustering\Microsoft.ML.KMeansClustering.csproj" />
<ProjectReference Include="..\..\src\Microsoft.ML.PCA\Microsoft.ML.PCA.csproj" />
<ProjectReference Include="..\..\src\Microsoft.ML.PipelineInference\Microsoft.ML.PipelineInference.csproj" />
<ProjectReference Include="..\..\src\Microsoft.ML.StandardLearners\Microsoft.ML.StandardLearners.csproj" />
Expand Down
121 changes: 121 additions & 0 deletions test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
using Microsoft.ML.Data;
using Microsoft.ML.Runtime;
using Microsoft.ML.Runtime.Api;
using Microsoft.ML.Trainers;
using Microsoft.ML.Transforms;
using System;
using System.Collections.Generic;
using Xunit;

namespace Microsoft.ML.Scenarios
{
public partial class ScenariosTests
{
/// <summary>
/// Scenario example: builds a text-clustering pipeline over the 20 newsgroups file
/// (load, concatenate text columns, featurize, k-means with K = 20), trains it, and
/// runs two ad-hoc predictions. The test is skipped (see the Skip reason below) and
/// makes no assertions; the two prediction results are not inspected.
/// </summary>
[Fact(Skip = "Missing data set. See https://github.com/dotnet/machinelearning/issues/203")]
public void PredictNewsCluster()
{
string dataPath = GetDataPath(@"external/20newsgroups.txt");

// Loader is configured for a headerless file with quoted strings allowed and no sparse format.
var pipeline = new LearningPipeline();
pipeline.Add(new TextLoader(dataPath).CreateFrom<NewsData>(useHeader: false, allowQuotedStrings:true, supportSparse:false));
// Presumably merges the "Subject" and "Content" columns into a new "AllText" column - confirm argument order against ColumnConcatenator.
pipeline.Add(new ColumnConcatenator("AllText", "Subject", "Content"));
// Presumably reads "AllText" and writes "Features"; uses lower-casing, predefined stop-word removal,
// L2 vector normalization, char n-grams of length 3 and word n-grams of length 1.
pipeline.Add(new TextFeaturizer("Features", "AllText")
{
KeepDiacritics = false,
KeepPunctuations = false,
TextCase = TextNormalizerTransformCaseNormalizationMode.Lower,
StopWordsRemover = new PredefinedStopWordsRemover(),
VectorNormalizer = TextTransformTextNormKind.L2,
CharFeatureExtractor = new NGramNgramExtractor() { NgramLength = 3, AllLengths = false },
WordFeatureExtractor = new NGramNgramExtractor() { NgramLength = 1, AllLengths = true }
});

// K = 20 clusters requested.
pipeline.Add(new KMeansPlusPlusClusterer() { K = 20 });
var model = pipeline.Train<NewsData, ClusteringPrediction>();
// NOTE(review): the verbatim string below appears to have review-page text ("Copy link" ... "controversial here.")
// embedded in the middle of it - looks like an extraction artifact; confirm against the real source file.
var gunResult = model.Predict(new NewsData() { Subject = "Let's disscuss gun control", Content = @"The United States has 88.8 guns per 100 people, or about 270,000,000 guns, which is the highest total and per capita number in the world. 22% of Americans own one or more guns (35% of men and 12% of women). America's pervasive gun culture stems in part from its colonial history, revolutionary roots, frontier expansion, and the Second Amendment, which states: ""A well regulated militia,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh good I'm glad we didn't decide to write anything controversial here. 😄

being necessary to the security of a free State,
the right of the people to keep and bear Arms,
shall not be infringed.""

Proponents of more gun control laws state that the Second Amendment was intended for militias; that gun violence would be reduced; that gun restrictions have always existed; and that a majority of Americans, including gun owners, support new gun restrictions. " });
// Second ad-hoc prediction on an unrelated topic; the result is not asserted on.
var puppiesResult = model.Predict(new NewsData()
{
Subject = "Studies Reveal Five Ways Dogs Show Us Their Love",
Content = @"Let's face it: We all adore our dogs as if they were family and we tend to shower our dogs with affection in numerous ways. Perhaps you may buy your dog a favorite toy or stop by the dog bakery to order some great tasting doggy cookies, or perhaps you just love patting your dog in the evening in the way he most loves. But how do our dogs tell us they love us too?

Until the day your dog can talk, you'll never likely hear him pronounce ""I love you,"" and in the meantime, don't expect him to purchase you a Hallmark card or some balloons with those renowned romantic words printed on top. Also, don’t expect a box of chocolates or a bouquet of flowers from your dog when Valentine's day is around the corner. Sometimes it might feel like we're living an uneven relationship, but just because dogs don't communicate their love the way we do, doesn't mean they don't love us!"
});
}

/// <summary>
/// Input row for the news-clustering example. Each field is bound to a column of the
/// loaded text file by ordinal through the <c>Column</c> attribute.
/// </summary>
public class NewsData
{
// Column 0 of the input file.
[Column(ordinal: "0")]
public string Id;

// Column 1 of the input file, exposed to the pipeline under the name "Label".
// NOTE(review): the lines between this attribute and the Topic field look like review-page text
// picked up during extraction rather than source code - confirm against the real file.
[Column(ordinal: "1", name: "Label")]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should these be using DefaultColumnNames?

public string Topic;

// Column 2 of the input file.
[Column(ordinal: "2")]
public string Subject;

// Column 3 of the input file.
[Column(ordinal: "3")]
public string Content;
}

/// <summary>
/// Output row produced by the clustering models trained in these scenarios.
/// </summary>
public class ClusteringPrediction
{
    /// <summary>Value bound to the "PredictedLabel" output column.</summary>
    [ColumnName("PredictedLabel")] public uint SelectedClusterId;

    /// <summary>
    /// Values bound to the "Score" output column (presumably one entry per cluster - confirm against the trainer).
    /// </summary>
    [ColumnName("Score")] public float[] Distance;
}

/// <summary>
/// Input row for the synthetic clustering test: a single fixed-size feature vector.
/// </summary>
public class ClusteringData
{
    /// <summary>Two-element vector bound to the "Features" column.</summary>
    [ColumnName("Features"), VectorType(2)]
    public float[] Points;
}

/// <summary>
/// Trains k-means on synthetic 2-D data generated around <c>k</c> points evenly spaced on the
/// unit circle, then verifies that each of those generating points is assigned to a different
/// cluster by the trained model.
/// </summary>
[Fact]
public void PredictClusters()
{
    const int n = 1000;
    const int k = 5;

    // Fixed seed so the generated data set is identical on every run; with an unseeded
    // generator a failure could not be reproduced.
    var rand = new Random(1);

    var clusters = new ClusteringData[k];
    var data = new ClusteringData[n];

    // Pick cluster centres as points on the unit circle at angle 360 * i / k degrees from the X axis.
    for (int i = 0; i < k; i++)
    {
        double angle = Math.PI * i * 2 / k;
        clusters[i] = new ClusteringData
        {
            Points = new float[2] { (float)Math.Cos(angle), (float)Math.Sin(angle) }
        };
    }

    // Create data points by randomly picking a centre and shifting the point slightly away from it.
    // The same offset, in [-0.5 / k, 0.5 / k), is applied to both coordinates.
    for (int i = 0; i < n; i++)
    {
        var index = rand.Next(0, k);
        var shift = (rand.NextDouble() - 0.5) / k;
        data[i] = new ClusteringData
        {
            Points = new float[2]
            {
                (float)(clusters[index].Points[0] + shift),
                (float)(clusters[index].Points[1] + shift)
            }
        };
    }

    var pipeline = new LearningPipeline();
    pipeline.Add(CollectionDataSource.Create(data));
    pipeline.Add(new KMeansPlusPlusClusterer() { K = k });
    var model = pipeline.Train<ClusteringData, ClusteringPrediction>();

    // Validate that the points used as centres during data generation belong to different clusters.
    // HashSet.Add returns false when the id is already present, i.e. two centres share a cluster.
    var labels = new HashSet<uint>();
    for (int i = 0; i < k; i++)
    {
        var scores = model.Predict(clusters[i]);
        Assert.True(
            labels.Add(scores.SelectedClusterId),
            $"Centre {i} was assigned to cluster {scores.SelectedClusterId}, which is already used by another centre.");
    }
}
}
}