Converted listed text transforms into transformers/estimators. #953
Changes from 11 commits
73c2aa8
d145d07
6bd2bf4
e8ef7ef
1350587
450efda
db261f8
1653e1c
4a93c72
6b28ca6
ee835d1
4659b80
da08376
9688edd
Large diffs are not rendered by default.
@@ -0,0 +1,11 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=text:TX:0
#@ col=words_without_stopwords:TX:1-**
#@ }
text
==rude== dude, you are rude upload that carl picture back, or else. ==rude== dude, you rude upload carl picture back, else.
== ok! == im going to vandalize wild ones wiki then!!! == ok! == im going vandalize wild ones wiki then!!!
stop trolling, zapatancas, calling me a liar merely demonstartes that you arer zapatancas. you may choose to chase every legitimate editor from this site and ignore me but i am an editor with a record that isnt 99% trolling and therefore my wishes are not to be completely ignored by a sockpuppet like yourself. the consensus is overwhelmingly against you and your trollin g lover zapatancas, stop trolling, zapatancas, calling liar merely demonstartes you arer zapatancas. you choose chase legitimate editor site ignore i editor record isnt 99% trolling wishes completely ignored sockpuppet like yourself. consensus overwhelmingly you your trollin g lover zapatancas,
==you're cool== you seem like a really cool guy... *bursts out laughing at sarcasm*. ==you're cool== you like really cool guy... *bursts laughing sarcasm*.
Large diffs are not rendered by default.
@@ -0,0 +1,11 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=text:TX:0
#@ col=words_without_stopwords:TX:1-**
#@ }
text
==rude== dude, you are rude upload that carl picture back, or else. ==rude== dude, you rude upload carl picture back, else.
== ok! == im going to vandalize wild ones wiki then!!! == ok! == im going vandalize wild ones wiki then!!!
stop trolling, zapatancas, calling me a liar merely demonstartes that you arer zapatancas. you may choose to chase every legitimate editor from this site and ignore me but i am an editor with a record that isnt 99% trolling and therefore my wishes are not to be completely ignored by a sockpuppet like yourself. the consensus is overwhelmingly against you and your trollin g lover zapatancas, stop trolling, zapatancas, calling liar merely demonstartes you arer zapatancas. you choose chase legitimate editor site ignore i editor record isnt 99% trolling wishes completely ignored sockpuppet like yourself. consensus overwhelmingly you your trollin g lover zapatancas,
==you're cool== you seem like a really cool guy... *bursts out laughing at sarcasm*. ==you're cool== you like really cool guy... *bursts laughing sarcasm*.
@@ -88,5 +88,108 @@ public void TextTokenizationWorkout()
            CheckEquality("Text", "tokenized.tsv");
            Done();
        }

        [Fact]
        public void TextNormalizationAndStopwordRemoverWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var data = TextLoader.CreateReader(Env, ctx => (
                    label: ctx.LoadBool(0),
                    text: ctx.LoadText(1)), hasHeader: true)
                .Read(new MultiFileSource(sentimentDataPath));

            var invalidData = TextLoader.CreateReader(Env, ctx => (
                    label: ctx.LoadBool(0),
                    text: ctx.LoadFloat(1)), hasHeader: true)
                .Read(new MultiFileSource(sentimentDataPath));

            var est = new TextNormalizer(Env, "text")
                .Append(new WordTokenizer(Env, "text", "words"))
                .Append(new StopwordRemover(Env, "words", "words_without_stopwords"));
            TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);

            var outputPath = GetOutputPath("Text", "words_without_stopwords.tsv");
            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = new ChooseColumnsTransform(Env, savedData, "text", "words_without_stopwords");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "words_without_stopwords.tsv");
            Done();
        }

        [Fact]
        public void WordBagWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var data = TextLoader.CreateReader(Env, ctx => (
                    label: ctx.LoadBool(0),
                    text: ctx.LoadText(1)), hasHeader: true)
                .Read(new MultiFileSource(sentimentDataPath));

            var invalidData = TextLoader.CreateReader(Env, ctx => (
                    label: ctx.LoadBool(0),
                    text: ctx.LoadFloat(1)), hasHeader: true)
                .Read(new MultiFileSource(sentimentDataPath));

            var est = new WordBagEstimator(Env, "text", "bag_of_words")
                .Append(new WordHashBagEstimator(Env, "text", "bag_of_wordshash"));
            //TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);
This method is failing when the size of the output vector is determined by the incoming IDataView. The method calls the fit method to learn the output schema. @Zruty0, need to find a way to handle this case. #Resolved

Need to fix how the ngram output vector size is computed, i.e. need to ensure that the vector size is at least one at the following line.

In reply to: 218967284 [](ancestors = 218967284)

Actually, NgramTransform is computing metadata based on the size of the ngrams. I need to look into this more deeply to enable it for EmptyDataView. I am thinking of opening an issue for this. What do you think?

In reply to: 219285825 [](ancestors = 219285825,218989146,218967284)

Is this an issue on empty inputs? Is it possible to test at least the rest?

In reply to: 219315899 [](ancestors = 219315899,219287182,219285825,218989146,218967284)
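A minimal sketch of the kind of guard being discussed, assuming a hypothetical helper and a hypothetical learnedNgrams pool built during Fit; this is not the actual NgramTransform code:

using System;
using System.Collections.Generic;

static class NgramSizeGuard
{
    // Hypothetical illustration: if Fit ran over an empty IDataView, no ngrams are learned
    // and the inferred output vector size would be zero. Clamping keeps the output column
    // a valid, non-empty vector size for schema/metadata propagation.
    public static int GetOutputVectorSize(IReadOnlyCollection<string> learnedNgrams)
        => Math.Max(1, learnedNgrams.Count);   // ensure the vector size is at least one
}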
            var outputPath = GetOutputPath("Text", "bag_of_words.tsv");
            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = new ChooseColumnsTransform(Env, savedData, "text", "bag_of_words", "bag_of_wordshash");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "bag_of_words.tsv");
            Done();
        }

        [Fact]
        public void NgramWorkout()
        {
            string sentimentDataPath = GetDataPath("wikipedia-detox-250-line-data.tsv");
            var data = TextLoader.CreateReader(Env, ctx => (
                    label: ctx.LoadBool(0),
                    text: ctx.LoadText(1)), hasHeader: true)
                .Read(new MultiFileSource(sentimentDataPath));

            var invalidData = TextLoader.CreateReader(Env, ctx => (
                    label: ctx.LoadBool(0),
                    text: ctx.LoadFloat(1)), hasHeader: true)
                .Read(new MultiFileSource(sentimentDataPath));

            var est = new WordTokenizer(Env, "text", "text")
                .Append(new TermEstimator(Env, "text", "terms"))
                .Append(new NgramEstimator(Env, "terms", "ngrams"))
                .Append(new NgramHashEstimator(Env, "terms", "ngramshash"));
            //TestEstimatorCore(est, data.AsDynamic, invalidInput: invalidData.AsDynamic);
Same. See the comment on the test above on line 143. #Resolved
            var outputPath = GetOutputPath("Text", "ngrams.tsv");
            using (var ch = Env.Start("save"))
            {
                var saver = new TextSaver(Env, new TextSaver.Arguments { Silent = true });
                IDataView savedData = TakeFilter.Create(Env, est.Fit(data.AsDynamic).Transform(data.AsDynamic), 4);
                savedData = new ChooseColumnsTransform(Env, savedData, "text", "terms", "ngrams", "ngramshash");

                using (var fs = File.Create(outputPath))
                    DataSaverUtils.SaveDataView(ch, saver, savedData, fs, keepHidden: true);
            }

            CheckEquality("Text", "ngrams.tsv");
            Done();
        }
    }
}
Why is the invalid data the same as the valid one?

It's not valid: here the text column is loaded as a float instead of text.

In reply to: 219582043 [](ancestors = 219582043)
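For reference, the only difference between the two readers in the tests above is the type declared for column 1; the snippet below mirrors the diff, and Env and sentimentDataPath are the same objects used in those test methods:

// Valid input: column 1 is declared as text, which is what the text estimators expect.
var data = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadText(1)), hasHeader: true)
    .Read(new MultiFileSource(sentimentDataPath));

// "Invalid" input: the same column is declared as a float, so TestEstimatorCore should
// see the estimators reject it during schema validation.
var invalidData = TextLoader.CreateReader(Env, ctx => (
        label: ctx.LoadBool(0),
        text: ctx.LoadFloat(1)), hasHeader: true)
    .Read(new MultiFileSource(sentimentDataPath));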