Skip to content

Converted listed text transforms into transformers/estimators. #953

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Sep 21, 2018
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions src/Microsoft.ML.Data/DataLoadSave/TransformWrapper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -110,24 +110,24 @@ public TransformWrapper(IHostEnvironment env, ModelLoadContext ctx)
/// <summary>
/// Estimator for trained wrapped transformers.
/// </summary>
public abstract class TrainedWrapperEstimatorBase : IEstimator<TransformWrapper>
{
    // Host environment used for argument validation and for probing the output schema.
    // Exposed to derived classes so their Fit implementations can validate and log.
    protected readonly IHost Host;

    protected TrainedWrapperEstimatorBase(IHost host)
    {
        Contracts.CheckValue(host, nameof(host));
        Host = host;
    }

    /// <summary>
    /// Trains on <paramref name="input"/> and returns the wrapped transformer.
    /// </summary>
    public abstract TransformWrapper Fit(IDataView input);

    /// <summary>
    /// Computes the output schema shape by fitting against an empty data view
    /// synthesized from the input schema shape.
    /// </summary>
    public SchemaShape GetOutputSchema(SchemaShape inputSchema)
    {
        Host.CheckValue(inputSchema, nameof(inputSchema));

        var fakeSchema = new FakeSchema(Host, inputSchema);
        var transformer = Fit(new EmptyDataView(Host, fakeSchema));
        return SchemaShape.Create(transformer.GetOutputSchema(fakeSchema));
    }
}
Expand Down
8 changes: 4 additions & 4 deletions src/Microsoft.ML.Data/Transforms/TermEstimator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@ public static class Defaults
/// <summary>
/// Convenience constructor for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="inputColumn">Name of the column to be transformed.</param>
/// <param name="outputColumn">Name of the output column. If this is null '<paramref name="inputColumn"/>' will be used.</param>
/// <param name="maxNumTerms">Maximum number of terms to keep per column when auto-training.</param>
/// <param name="sort">How items should be ordered when vectorized. By default, they will be in the order encountered.
/// If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').</param>
public TermEstimator(IHostEnvironment env, string inputColumn, string outputColumn = null, int maxNumTerms = Defaults.MaxNumTerms, TermTransform.SortOrder sort = Defaults.Sort) :
    this(env, new TermTransform.ColumnInfo(inputColumn, outputColumn ?? inputColumn, maxNumTerms, sort))
{
}

Expand Down
459 changes: 459 additions & 0 deletions src/Microsoft.ML.Transforms/Text/TextStaticExtensions.cs

Large diffs are not rendered by default.

613 changes: 613 additions & 0 deletions src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions test/BaselineOutput/SingleDebug/Text/bag_of_words.tsv

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions test/BaselineOutput/SingleDebug/Text/ngrams.tsv

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions test/BaselineOutput/SingleDebug/Text/words_without_stopwords.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=text:TX:0
#@ col=words_without_stopwords:TX:1-**
#@ }
text
==rude== dude, you are rude upload that carl picture back, or else. ==rude== dude, you rude upload carl picture back, else.
== ok! == im going to vandalize wild ones wiki then!!! == ok! == im going vandalize wild ones wiki then!!!
stop trolling, zapatancas, calling me a liar merely demonstartes that you arer zapatancas. you may choose to chase every legitimate editor from this site and ignore me but i am an editor with a record that isnt 99% trolling and therefore my wishes are not to be completely ignored by a sockpuppet like yourself. the consensus is overwhelmingly against you and your trollin g lover zapatancas, stop trolling, zapatancas, calling liar merely demonstartes you arer zapatancas. you choose chase legitimate editor site ignore i editor record isnt 99% trolling wishes completely ignored sockpuppet like yourself. consensus overwhelmingly you your trollin g lover zapatancas,
==you're cool== you seem like a really cool guy... *bursts out laughing at sarcasm*. ==you're cool== you like really cool guy... *bursts laughing sarcasm*.
12 changes: 12 additions & 0 deletions test/BaselineOutput/SingleRelease/Text/bag_of_words.tsv

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions test/BaselineOutput/SingleRelease/Text/ngrams.tsv

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions test/BaselineOutput/SingleRelease/Text/words_without_stopwords.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=text:TX:0
#@ col=words_without_stopwords:TX:1-**
#@ }
text
==rude== dude, you are rude upload that carl picture back, or else. ==rude== dude, you rude upload carl picture back, else.
== ok! == im going to vandalize wild ones wiki then!!! == ok! == im going vandalize wild ones wiki then!!!
stop trolling, zapatancas, calling me a liar merely demonstartes that you arer zapatancas. you may choose to chase every legitimate editor from this site and ignore me but i am an editor with a record that isnt 99% trolling and therefore my wishes are not to be completely ignored by a sockpuppet like yourself. the consensus is overwhelmingly against you and your trollin g lover zapatancas, stop trolling, zapatancas, calling liar merely demonstartes you arer zapatancas. you choose chase legitimate editor site ignore i editor record isnt 99% trolling wishes completely ignored sockpuppet like yourself. consensus overwhelmingly you your trollin g lover zapatancas,
==you're cool== you seem like a really cool guy... *bursts out laughing at sarcasm*. ==you're cool== you like really cool guy... *bursts laughing sarcasm*.
87 changes: 87 additions & 0 deletions test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -431,5 +431,92 @@ public void Tokenize()
Assert.True(type.IsVector && !type.IsKnownSizeVector && type.ItemType.IsKey);
Assert.True(type.ItemType.AsKey.RawKind == DataKind.U2);
}

[Fact]
public void NormalizeTextAndRemoveStopWords()
{
    // Read the detox sample: column 0 is a boolean label, column 1 is free text.
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var reader = TextLoader.CreateReader(env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true);
    var dataSource = new MultiFileSource(dataPath);
    var data = reader.Read(dataSource);

    // Pipeline: normalize the raw text, and independently tokenize it then strip stopwords.
    var estimator = data.MakeNewEstimator()
        .Append(row => (
            row.label,
            normalized_text: row.text.NormalizeText(),
            words_without_stopwords: row.text.TokenizeText().RemoveStopwords()));

    var transformed = estimator.Fit(data).Transform(data);
    var schema = transformed.AsDynamic.Schema;

    // Stopword-free output is a variable-length vector of text tokens.
    Assert.True(schema.TryGetColumnIndex("words_without_stopwords", out int stopwordsCol));
    var colType = schema.GetColumnType(stopwordsCol);
    Assert.True(colType.IsVector && !colType.IsKnownSizeVector && colType.ItemType.IsText);

    // Normalized output stays a single scalar text column.
    Assert.True(schema.TryGetColumnIndex("normalized_text", out int normTextCol));
    colType = schema.GetColumnType(normTextCol);
    Assert.True(colType.IsText && !colType.IsVector);
}

[Fact]
public void ConvertToWordBag()
{
    // Read the detox sample: column 0 is a boolean label, column 1 is free text.
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var reader = TextLoader.CreateReader(env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true);
    var dataSource = new MultiFileSource(dataPath);
    var data = reader.Read(dataSource);

    // Pipeline: featurize the text both as a plain word bag and as a hashed word bag.
    var estimator = data.MakeNewEstimator()
        .Append(row => (
            row.label,
            bagofword: row.text.ToBagofWords(),
            bagofhashedword: row.text.ToBagofHashedWords()));

    var transformed = estimator.Fit(data).Transform(data);
    var schema = transformed.AsDynamic.Schema;

    // Both featurizations yield fixed-size numeric vectors.
    Assert.True(schema.TryGetColumnIndex("bagofword", out int bagofwordCol));
    var colType = schema.GetColumnType(bagofwordCol);
    Assert.True(colType.IsVector && colType.IsKnownSizeVector && colType.ItemType.IsNumber);

    Assert.True(schema.TryGetColumnIndex("bagofhashedword", out int bagofhashedwordCol));
    colType = schema.GetColumnType(bagofhashedwordCol);
    Assert.True(colType.IsVector && colType.IsKnownSizeVector && colType.ItemType.IsNumber);
}

[Fact]
public void Ngrams()
{
    // Read the detox sample: column 0 is a boolean label, column 1 is free text.
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var reader = TextLoader.CreateReader(env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true);
    var dataSource = new MultiFileSource(dataPath);
    var data = reader.Read(dataSource);

    // Pipeline: tokenize, map tokens to keys, then extract plain and hashed n-grams.
    var estimator = data.MakeNewEstimator()
        .Append(row => (
            row.label,
            ngrams: row.text.TokenizeText().ToKey().ToNgrams(),
            ngramshash: row.text.TokenizeText().ToKey().ToNgramsHash()));

    var transformed = estimator.Fit(data).Transform(data);
    var schema = transformed.AsDynamic.Schema;

    // Both n-gram featurizations yield fixed-size numeric vectors.
    Assert.True(schema.TryGetColumnIndex("ngrams", out int ngramsCol));
    var colType = schema.GetColumnType(ngramsCol);
    Assert.True(colType.IsVector && colType.IsKnownSizeVector && colType.ItemType.IsNumber);

    Assert.True(schema.TryGetColumnIndex("ngramshash", out int ngramshashCol));
    colType = schema.GetColumnType(ngramshashCol);
    Assert.True(colType.IsVector && colType.IsKnownSizeVector && colType.ItemType.IsNumber);
}
}
}
109 changes: 109 additions & 0 deletions test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -88,5 +88,114 @@ public void TextTokenizationWorkout()
CheckEquality("Text", "tokenized.tsv");
Done();
}


[Fact]
public void TextNormalizationAndStopwordRemoverWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Valid input: column 1 is read as text.
    var data = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Invalid input: the same column read as float, which the text pipeline must reject.
    var invalidData = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Normalize, tokenize, then remove stopwords.
    var pipeline = new TextNormalizer(Env, "text")
        .Append(new WordTokenizer(Env, "text", "words"))
        .Append(new StopwordRemover(Env, "words", "words_without_stopwords"));
    TestEstimatorCore(pipeline, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    // Save the first four transformed rows and compare them with the checked-in baseline.
    var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView savedData = TakeFilter.Create(Env, pipeline.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(Env, savedData, "text", "words_without_stopwords");

        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "words_without_stopwords.tsv");
    Done();
}

[Fact]
public void WordBagWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Valid input: column 1 is read as text.
    var data = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Invalid input: the same column read as float, which the text pipeline must reject.
    var invalidData = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Featurize the text as a plain word bag followed by a hashed word bag.
    var est = new WordBagEstimator(Env, "text", "bag_of_words")
        .Append(new WordHashBagEstimator(Env, "text", "bag_of_wordshash"));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    // Save the first four transformed rows and compare them with the checked-in baseline.
    var outputPath = GetOutputPath("Text", "bag_of_words.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(Env, savedData, "text", "bag_of_words", "bag_of_wordshash");

        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "bag_of_words.tsv");
    Done();
}

[Fact]
public void NgramWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Valid input: column 1 is read as text.
    var data = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Invalid input: the same column read as float, which the text pipeline must reject.
    var invalidData = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Tokenize in place, map tokens to term keys, then extract plain and hashed n-grams.
    var pipeline = new WordTokenizer(Env, "text", "text")
        .Append(new TermEstimator(Env, "text", "terms"))
        .Append(new NgramEstimator(Env, "terms", "ngrams"))
        .Append(new NgramHashEstimator(Env, "terms", "ngramshash"));

    // The following call fails because of the following issue
    // https://github.com/dotnet/machinelearning/issues/969
    // TestEstimatorCore(pipeline, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    // Save the first four transformed rows and compare them with the checked-in baseline.
    var outputPath = GetOutputPath("Text", "ngrams.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView savedData = TakeFilter.Create(Env, pipeline.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(Env, savedData, "text", "terms", "ngrams", "ngramshash");

        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "ngrams.tsv");
    Done();
}
}
}