Skip to content

Converted listed text transforms into transformers/estimators. #953

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 14 commits into from
Sep 21, 2018
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions src/Microsoft.ML.Data/DataLoadSave/TransformWrapper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -110,24 +110,24 @@ public TransformWrapper(IHostEnvironment env, ModelLoadContext ctx)
/// <summary>
/// Estimator for trained wrapped transformers.
/// </summary>
internal abstract class TrainedWrapperEstimatorBase : IEstimator<TransformWrapper>
public abstract class TrainedWrapperEstimatorBase : IEstimator<TransformWrapper>
{
private readonly IHost _host;
protected readonly IHost Host;

protected TrainedWrapperEstimatorBase(IHost host)
{
Contracts.CheckValue(host, nameof(host));
_host = host;
Host = host;
}

public abstract TransformWrapper Fit(IDataView input);

public SchemaShape GetOutputSchema(SchemaShape inputSchema)
{
_host.CheckValue(inputSchema, nameof(inputSchema));
Host.CheckValue(inputSchema, nameof(inputSchema));

var fakeSchema = new FakeSchema(_host, inputSchema);
var transformer = Fit(new EmptyDataView(_host, fakeSchema));
var fakeSchema = new FakeSchema(Host, inputSchema);
var transformer = Fit(new EmptyDataView(Host, fakeSchema));
return SchemaShape.Create(transformer.GetOutputSchema(fakeSchema));
}
}
Expand Down
8 changes: 4 additions & 4 deletions src/Microsoft.ML.Data/Transforms/TermEstimator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@ public static class Defaults
/// Convenience constructor for public facing API.
/// </summary>
/// <param name="env">Host Environment.</param>
/// <param name="name">Name of the output column.</param>
/// <param name="source">Name of the column to be transformed. If this is null '<paramref name="name"/>' will be used.</param>
/// <param name="inputColumn">Name of the column to be transformed.</param>
/// <param name="outputColumn">Name of the output column. If this is null '<paramref name="inputColumn"/>' will be used.</param>
/// <param name="maxNumTerms">Maximum number of terms to keep per column when auto-training.</param>
/// <param name="sort">How items should be ordered when vectorized. By default, they will be in the order encountered.
/// If by value items are sorted according to their default comparison, e.g., text sorting will be case sensitive (e.g., 'A' then 'Z' then 'a').</param>
public TermEstimator(IHostEnvironment env, string name, string source = null, int maxNumTerms = Defaults.MaxNumTerms, TermTransform.SortOrder sort = Defaults.Sort) :
this(env, new TermTransform.ColumnInfo(name, source ?? name, maxNumTerms, sort))
public TermEstimator(IHostEnvironment env, string inputColumn, string outputColumn = null, int maxNumTerms = Defaults.MaxNumTerms, TermTransform.SortOrder sort = Defaults.Sort) :
this(env, new TermTransform.ColumnInfo(inputColumn, outputColumn ?? inputColumn, maxNumTerms, sort))
{
}

Expand Down
459 changes: 459 additions & 0 deletions src/Microsoft.ML.Transforms/Text/TextStaticExtensions.cs

Large diffs are not rendered by default.

613 changes: 613 additions & 0 deletions src/Microsoft.ML.Transforms/Text/WrappedTextTransformers.cs

Large diffs are not rendered by default.

12 changes: 12 additions & 0 deletions test/BaselineOutput/SingleDebug/Text/bag_of_words.tsv

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions test/BaselineOutput/SingleDebug/Text/ngrams.tsv

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions test/BaselineOutput/SingleDebug/Text/words_without_stopwords.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=text:TX:0
#@ col=words_without_stopwords:TX:1-**
#@ }
text
==rude== dude, you are rude upload that carl picture back, or else. ==rude== dude, you rude upload carl picture back, else.
== ok! == im going to vandalize wild ones wiki then!!! == ok! == im going vandalize wild ones wiki then!!!
stop trolling, zapatancas, calling me a liar merely demonstartes that you arer zapatancas. you may choose to chase every legitimate editor from this site and ignore me but i am an editor with a record that isnt 99% trolling and therefore my wishes are not to be completely ignored by a sockpuppet like yourself. the consensus is overwhelmingly against you and your trollin g lover zapatancas, stop trolling, zapatancas, calling liar merely demonstartes you arer zapatancas. you choose chase legitimate editor site ignore i editor record isnt 99% trolling wishes completely ignored sockpuppet like yourself. consensus overwhelmingly you your trollin g lover zapatancas,
==you're cool== you seem like a really cool guy... *bursts out laughing at sarcasm*. ==you're cool== you like really cool guy... *bursts laughing sarcasm*.
12 changes: 12 additions & 0 deletions test/BaselineOutput/SingleRelease/Text/bag_of_words.tsv

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions test/BaselineOutput/SingleRelease/Text/ngrams.tsv

Large diffs are not rendered by default.

11 changes: 11 additions & 0 deletions test/BaselineOutput/SingleRelease/Text/words_without_stopwords.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=text:TX:0
#@ col=words_without_stopwords:TX:1-**
#@ }
text
==rude== dude, you are rude upload that carl picture back, or else. ==rude== dude, you rude upload carl picture back, else.
== ok! == im going to vandalize wild ones wiki then!!! == ok! == im going vandalize wild ones wiki then!!!
stop trolling, zapatancas, calling me a liar merely demonstartes that you arer zapatancas. you may choose to chase every legitimate editor from this site and ignore me but i am an editor with a record that isnt 99% trolling and therefore my wishes are not to be completely ignored by a sockpuppet like yourself. the consensus is overwhelmingly against you and your trollin g lover zapatancas, stop trolling, zapatancas, calling liar merely demonstartes you arer zapatancas. you choose chase legitimate editor site ignore i editor record isnt 99% trolling wishes completely ignored sockpuppet like yourself. consensus overwhelmingly you your trollin g lover zapatancas,
==you're cool== you seem like a really cool guy... *bursts out laughing at sarcasm*. ==you're cool== you like really cool guy... *bursts laughing sarcasm*.
87 changes: 87 additions & 0 deletions test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -431,5 +431,92 @@ public void Tokenize()
Assert.True(type.IsVector && !type.IsKnownSizeVector && type.ItemType.IsKey);
Assert.True(type.ItemType.AsKey.RawKind == DataKind.U2);
}

[Fact]
public void NormalizeTextAndRemoveStopWords()
{
    // Load the detox dataset with a boolean label and a raw text column.
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var reader = TextLoader.CreateReader(env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true);
    var data = reader.Read(new MultiFileSource(dataPath));

    // Pipeline: normalize the raw text, and separately tokenize it and strip stopwords.
    var pipeline = data.MakeNewEstimator()
        .Append(row => (
            row.label,
            normalized_text: row.text.NormalizeText(),
            words_without_stopwords: row.text.TokenizeText().RemoveStopwords()));

    var transformed = pipeline.Fit(data).Transform(data);
    var schema = transformed.AsDynamic.Schema;

    // Stopword removal yields a variable-length vector of text tokens.
    Assert.True(schema.TryGetColumnIndex("words_without_stopwords", out int stopwordsCol));
    var colType = schema.GetColumnType(stopwordsCol);
    Assert.True(colType.IsVector && !colType.IsKnownSizeVector && colType.ItemType.IsText);

    // Normalization yields a single (scalar) text column.
    Assert.True(schema.TryGetColumnIndex("normalized_text", out int normTextCol));
    colType = schema.GetColumnType(normTextCol);
    Assert.True(colType.IsText && !colType.IsVector);
}

[Fact]
public void ConvertToWordBag()
{
    // Load the detox dataset with a boolean label and a raw text column.
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var reader = TextLoader.CreateReader(env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true);
    var data = reader.Read(new MultiFileSource(dataPath));

    // Pipeline: featurize the text both as a dictionary-based and a hashed bag of words.
    var pipeline = data.MakeNewEstimator()
        .Append(row => (
            row.label,
            bagofword: row.text.ToBagofWords(),
            bagofhashedword: row.text.ToBagofHashedWords()));

    var transformed = pipeline.Fit(data).Transform(data);
    var schema = transformed.AsDynamic.Schema;

    // Both featurizations produce fixed-size numeric vectors.
    Assert.True(schema.TryGetColumnIndex("bagofword", out int bagofwordCol));
    var colType = schema.GetColumnType(bagofwordCol);
    Assert.True(colType.IsVector && colType.IsKnownSizeVector && colType.ItemType.IsNumber);

    Assert.True(schema.TryGetColumnIndex("bagofhashedword", out int bagofhashedwordCol));
    colType = schema.GetColumnType(bagofhashedwordCol);
    Assert.True(colType.IsVector && colType.IsKnownSizeVector && colType.ItemType.IsNumber);
}

[Fact]
public void Ngrams()
{
    // Load the detox dataset with a boolean label and a raw text column.
    var env = new ConsoleEnvironment(seed: 0);
    var dataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
    var reader = TextLoader.CreateReader(env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true);
    var data = reader.Read(new MultiFileSource(dataPath));

    // Pipeline: tokenize, map tokens to keys, then extract n-grams
    // via the dictionary-based and the hashed extractor.
    var pipeline = data.MakeNewEstimator()
        .Append(row => (
            row.label,
            ngrams: row.text.TokenizeText().ToKey().ToNgrams(),
            ngramshash: row.text.TokenizeText().ToKey().ToNgramsHash()));

    var transformed = pipeline.Fit(data).Transform(data);
    var schema = transformed.AsDynamic.Schema;

    // Both extractors produce fixed-size numeric vectors.
    Assert.True(schema.TryGetColumnIndex("ngrams", out int ngramsCol));
    var colType = schema.GetColumnType(ngramsCol);
    Assert.True(colType.IsVector && colType.IsKnownSizeVector && colType.ItemType.IsNumber);

    Assert.True(schema.TryGetColumnIndex("ngramshash", out int ngramshashCol));
    colType = schema.GetColumnType(ngramshashCol);
    Assert.True(colType.IsVector && colType.IsKnownSizeVector && colType.ItemType.IsNumber);
}
}
}
103 changes: 103 additions & 0 deletions test/Microsoft.ML.Tests/Transformers/TextFeaturizerTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -88,5 +88,108 @@ public void TextTokenizationWorkout()
CheckEquality("Text", "tokenized.tsv");
Done();
}


[Fact]
public void TextNormalizationAndStopwordRemoverWorkout()
{
    string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");

    // Valid input: the second column is read as text.
    var data = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Invalid input: the same column forced to float, so the estimator must reject it.
    var invalidData = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
        .Read(new MultiFileSource(sentimentDataPath));

    // Normalize the text, split it into words, then drop stopwords.
    var pipeline = new TextNormalizer(Env, "text")
        .Append(new WordTokenizer(Env, "text", "words"))
        .Append(new StopwordRemover(Env, "words", "words_without_stopwords"));
    TestEstimatorCore(pipeline, data.AsDynamic, invalidInput: invalidData.AsDynamic);

    // Save the first four transformed rows and compare against the checked-in baseline.
    var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");
    using (var ch = Env.Start("save"))
    {
        var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
        IDataView savedData = TakeFilter.Create(Env, pipeline.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
        savedData = new ChooseColumnsTransform(Env, savedData, "text", "words_without_stopwords");

        using (var fs = File.Create(outputPath))
            DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
    }

    CheckEquality("Text", "words_without_stopwords.tsv");
    Done();
}

[Fact]
public void WordBagWorkout()
{
string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
var data = TextLoader.CreateReader(Env, ctx => (
label: ctx.LoadBool(0),
text: ctx.LoadText(1)), hasHeader: true)
.Read(new MultiFileSource(sentimentDataPath));

var invalidData = TextLoader.CreateReader(Env, ctx => (
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is invalid data the same as the valid one?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not valid: here the text column is deliberately loaded as a float, so the estimator should reject it.


In reply to: 219582043 [](ancestors = 219582043)

label: ctx.LoadBool(0),
text: ctx.LoadFloat(1)), hasHeader: true)
.Read(new MultiFileSource(sentimentDataPath));

var est = new WordBagEstimator(Env, "text", "bag_of_words").
Append(new WordHashBagEstimator(Env, "text", "bag_of_wordshash"));
//TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);
Copy link
Contributor Author

@zeahmed zeahmed Sep 19, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

//TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); [](start = 12, length = 78)

This method fails when the size of the output vector is determined by the incoming IDataView. The method calls Fit to learn the output schema, passing an EmptyDataView; when fitting on an EmptyDataView the size is zero, which makes the output a variable-sized vector. :)

@Zruty0, Need to find a way to handle this case. #Resolved

Copy link
Contributor Author

@zeahmed zeahmed Sep 19, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Need to fix how the ngram output vector size is computed i.e. need to ensure that vector size is at least one at the following line.

types[iinfo] = new VectorType(NumberType.Float, _ngramMaps[iinfo].Count);


In reply to: 218967284 [](ancestors = 218967284)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not do this in this PR?


In reply to: 218989146 [](ancestors = 218989146,218967284)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually, NgramTransform is computing metadata based on the size of ngrams. I need to deeply look into this to enable it for EmptyDataView. I am thinking to open an issue against this. What do you think?


In reply to: 219285825 [](ancestors = 219285825,218989146,218967284)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have created #969 for tracking this.


In reply to: 219287182 [](ancestors = 219287182,219285825,218989146,218967284)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this an issue on empty inputs? Is it possible to test at least the rest?


In reply to: 219315899 [](ancestors = 219315899,219287182,219285825,218989146,218967284)


var outputPath = GetOutputPath("Text", "bag_of_words.tsv");
using (var ch = Env.Start("save"))
{
var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
savedData = new ChooseColumnsTransform(Env, savedData, "text", "bag_of_words", "bag_of_wordshash");

using (var fs = File.Create(outputPath))
DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
}

CheckEquality("Text", "bag_of_words.tsv");
Done();
}

[Fact]
public void NgramWorkout()
{
string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
var data = TextLoader.CreateReader(Env, ctx => (
label: ctx.LoadBool(0),
text: ctx.LoadText(1)), hasHeader: true)
.Read(new MultiFileSource(sentimentDataPath));

var invalidData = TextLoader.CreateReader(Env, ctx => (
label: ctx.LoadBool(0),
text: ctx.LoadFloat(1)), hasHeader: true)
.Read(new MultiFileSource(sentimentDataPath));

var est = new WordTokenizer(Env, "text", "text")
.Append(new TermEstimator(Env, "text", "terms"))
.Append(new NgramEstimator(Env, "terms", "ngrams"))
.Append(new NgramHashEstimator(Env, "terms", "ngramshash"));
//TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);
Copy link
Contributor Author

@zeahmed zeahmed Sep 19, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

//TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic); [](start = 12, length = 78)

Same issue — see the comment on the test above (line 143). #Resolved

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have created #969 for tracking this.


In reply to: 218967678 [](ancestors = 218967678)


var outputPath = GetOutputPath("Text", "ngrams.tsv");
using (var ch = Env.Start("save"))
{
var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
savedData = new ChooseColumnsTransform(Env, savedData, "text", "terms", "ngrams", "ngramshash");

using (var fs = File.Create(outputPath))
DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
}

CheckEquality("Text", "ngrams.tsv");
Done();
}
}
}