-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Add examples for clustering #222
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Merged
justinormont
merged 7 commits into
dotnet:master
from
Ivanidzo4ka:ivanidze/bring_clustering
May 24, 2018
Merged
Changes from all commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,121 @@ | ||
using Microsoft.ML.Data; | ||
using Microsoft.ML.Runtime; | ||
using Microsoft.ML.Runtime.Api; | ||
using Microsoft.ML.Trainers; | ||
using Microsoft.ML.Transforms; | ||
using System; | ||
using System.Collections.Generic; | ||
using Xunit; | ||
|
||
namespace Microsoft.ML.Scenarios | ||
{ | ||
public partial class ScenariosTests | ||
{ | ||
        /// <summary>
        /// Builds a text-featurization + k-means pipeline over the 20 newsgroups file,
        /// trains it, and runs two hand-written articles through the resulting model.
        /// The test is currently skipped (see the Skip reason) and contains no assertions:
        /// both prediction results are computed but never inspected.
        /// </summary>
        [Fact(Skip = "Missing data set. See https://github.com/dotnet/machinelearning/issues/203")] | ||
        public void PredictNewsCluster() | ||
        { | ||
            string dataPath = GetDataPath(@"external/20newsgroups.txt"); | ||
|
||
            var pipeline = new LearningPipeline(); | ||
            // Rows are read into the NewsData schema: no header row, quoted strings allowed, no sparse format.
            pipeline.Add(new TextLoader(dataPath).CreateFrom<NewsData>(useHeader: false, allowQuotedStrings:true, supportSparse:false)); | ||
            // Presumably merges the Subject and Content columns into a single "AllText" column - confirm argument order.
            pipeline.Add(new ColumnConcatenator("AllText", "Subject", "Content")); | ||
            // Presumably featurizes "AllText" into "Features": lower-cased, diacritics and punctuation dropped,
            // predefined stop words removed, L2 vector normalization, char 3-grams and word 1-grams.
            pipeline.Add(new TextFeaturizer("Features", "AllText") | ||
            { | ||
                KeepDiacritics = false, | ||
                KeepPunctuations = false, | ||
                TextCase = TextNormalizerTransformCaseNormalizationMode.Lower, | ||
                StopWordsRemover = new PredefinedStopWordsRemover(), | ||
                VectorNormalizer = TextTransformTextNormKind.L2, | ||
                CharFeatureExtractor = new NGramNgramExtractor() { NgramLength = 3, AllLengths = false }, | ||
                WordFeatureExtractor = new NGramNgramExtractor() { NgramLength = 1, AllLengths = true } | ||
            }); | ||
|
||
            // K = 20 clusters; presumably one per newsgroup in the data set - TODO confirm.
            pipeline.Add(new KMeansPlusPlusClusterer() { K = 20 }); | ||
            var model = pipeline.Train<NewsData, ClusteringPrediction>(); | ||
            // First sample article (gun control). NOTE(review): gunResult is never asserted on.
            var gunResult = model.Predict(new NewsData() { Subject = "Let's disscuss gun control", Content = @"The United States has 88.8 guns per 100 people, or about 270,000,000 guns, which is the highest total and per capita number in the world. 22% of Americans own one or more guns (35% of men and 12% of women). America's pervasive gun culture stems in part from its colonial history, revolutionary roots, frontier expansion, and the Second Amendment, which states: ""A well regulated militia, | ||
being necessary to the security of a free State, | ||
the right of the people to keep and bear Arms, | ||
shall not be infringed."" | ||
|
||
Proponents of more gun control laws state that the Second Amendment was intended for militias; that gun violence would be reduced; that gun restrictions have always existed; and that a majority of Americans, including gun owners, support new gun restrictions. " }); | ||
            // Second sample article (dogs). NOTE(review): puppiesResult is never asserted on either.
            var puppiesResult = model.Predict(new NewsData() | ||
            { | ||
                Subject = "Studies Reveal Five Ways Dogs Show Us Their Love", | ||
                Content = @"Let's face it: We all adore our dogs as if they were family and we tend to shower our dogs with affection in numerous ways. Perhaps you may buy your dog a favorite toy or stop by the dog bakery to order some great tasting doggy cookies, or perhaps you just love patting your dog in the evening in the way he most loves. But how do our dogs tell us they love us too? | ||
|
||
Until the day your dog can talk, you'll never likely hear him pronounce ""I love you,"" and in the meantime, don't expect him to purchase you a Hallmark card or some balloons with those renowned romantic words printed on top. Also, don’t expect a box of chocolates or a bouquet of flowers from your dog when Valentine's day is around the corner. Sometimes it might feel like we're living an uneven relationship, but just because dogs don't communicate their love the way we do, doesn't mean they don't love us!" | ||
            }); | ||
        } | ||
|
||
        /// <summary>
        /// Row schema used by PredictNewsCluster when loading the news data set.
        /// Each field is bound to an input column through the ordinal given in its Column attribute.
        /// </summary>
        public class NewsData | ||
        { | ||
            // Column 0.
            [Column(ordinal: "0")] | ||
            public string Id; | ||
|
||
            // Column 1, exposed to the pipeline under the name "Label" rather than the field name.
            [Column(ordinal: "1", name: "Label")] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should these be using |
||
            public string Topic; | ||
|
||
            // Column 2; one of the two text columns concatenated by PredictNewsCluster.
            [Column(ordinal: "2")] | ||
            public string Subject; | ||
|
||
            // Column 3; the other text column concatenated by PredictNewsCluster.
            [Column(ordinal: "3")] | ||
            public string Content; | ||
        } | ||
|
||
        /// <summary>
        /// Prediction type used as the output of the clustering models trained in these tests.
        /// </summary>
        public class ClusteringPrediction | ||
        { | ||
            // Bound to the "PredictedLabel" output column.
            [ColumnName("PredictedLabel")] | ||
            public uint SelectedClusterId; | ||
            // Bound to the "Score" output column; presumably one distance per cluster - TODO confirm.
            [ColumnName("Score")] | ||
            public float[] Distance; | ||
        } | ||
|
||
        /// <summary>
        /// Input row for PredictClusters: a single point supplied to the pipeline as the "Features" column.
        /// </summary>
        public class ClusteringData | ||
        { | ||
            // Declared as a vector of size 2; PredictClusters fills it with an (x, y) pair.
            [ColumnName("Features")] | ||
            [VectorType(2)] | ||
            public float[] Points; | ||
        } | ||
|
||
[Fact] | ||
public void PredictClusters() | ||
{ | ||
int n = 1000; | ||
int k = 5; | ||
var rand = new Random(); | ||
var clusters = new ClusteringData[k]; | ||
var data = new ClusteringData[n]; | ||
for (int i = 0; i < k; i++) | ||
{ | ||
//pick clusters as points on circle with angle to axis X equal to 360*i/k | ||
clusters[i] = new ClusteringData { Points = new float[2] { (float)Math.Cos(Math.PI * i * 2 / k), (float)Math.Sin(Math.PI * i * 2 / k) } }; | ||
} | ||
// create data points by randomly picking cluster and shifting point slightly away from it. | ||
for (int i = 0; i < n; i++) | ||
{ | ||
var index = rand.Next(0, k); | ||
var shift = (rand.NextDouble() - 0.5) / k; | ||
data[i] = new ClusteringData | ||
{ | ||
Points = new float[2] | ||
{ | ||
(float)(clusters[index].Points[0] + shift), | ||
(float)(clusters[index].Points[1] + shift) | ||
} | ||
}; | ||
} | ||
var pipeline = new LearningPipeline(); | ||
pipeline.Add(CollectionDataSource.Create(data)); | ||
pipeline.Add(new KMeansPlusPlusClusterer() { K = k }); | ||
var model = pipeline.Train<ClusteringData, ClusteringPrediction>(); | ||
//validate that initial points we pick up as centers of cluster during data generation belong to different clusters. | ||
var labels = new HashSet<uint>(); | ||
for (int i = 0; i < k; i++) | ||
{ | ||
var scores = model.Predict(clusters[i]); | ||
Assert.True(!labels.Contains(scores.SelectedClusterId)); | ||
labels.Add(scores.SelectedClusterId); | ||
} | ||
} | ||
} | ||
} |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh good I'm glad we didn't decide to write anything controversial here. 😄