Skip to content

Commit 4e0800c

Browse files
Anipik authored and justinormont committed
WordEmbedding Tests added plus added dimension check for the first row (#880)
* wordEMbedding * warning removed * class renamed
1 parent 2e1fa4e commit 4e0800c

File tree

2 files changed

+32
-9
lines changed

2 files changed

+32
-9
lines changed

src/Microsoft.ML.Transforms/Text/WordEmbeddingsTransform.cs

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -426,13 +426,14 @@ private Model GetVocabularyDictionary()
426426
dimension = wordsInFirstLine.Length - 1;
427427
if (model == null)
428428
model = new Model(dimension);
429-
float temp;
430-
string firstKey = wordsInFirstLine[0];
431-
float[] firstValue = wordsInFirstLine.Skip(1).Select(x => float.TryParse(x, out temp) ? temp : Single.NaN).ToArray();
432-
if (!firstValue.Contains(Single.NaN))
433-
model.AddWordVector(ch, firstKey, firstValue);
434-
else
435-
ch.Warning($"Parsing error while reading model file: '{_modelFileNameWithPath}', line number 1");
429+
if (model.Dimension == dimension)
430+
{
431+
float temp;
432+
string firstKey = wordsInFirstLine[0];
433+
float[] firstValue = wordsInFirstLine.Skip(1).Select(x => float.TryParse(x, out temp) ? temp : Single.NaN).ToArray();
434+
if (!firstValue.Contains(Single.NaN))
435+
model.AddWordVector(ch, firstKey, firstValue);
436+
}
436437
pch.Checkpoint(lineNumber);
437438
}
438439
}

test/Microsoft.ML.Benchmarks/BigramAndTrigramBenchMark.cs renamed to test/Microsoft.ML.Benchmarks/Text/MultiClassClassification.cs

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,16 @@ internal class EmptyWriter : TextWriter
2020
public override Encoding Encoding => null;
2121
}
2222

23-
public class BigramAndTrigramBenchmark
23+
public class MultiClassClassification
2424
{
2525
private string _dataPath_Wiki;
2626
private string _modelPath_Wiki;
2727

2828
[GlobalSetup(Targets = new string[] {
2929
nameof(CV_Multiclass_WikiDetox_BigramsAndTrichar_OVAAveragedPerceptron),
30-
nameof(CV_Multiclass_WikiDetox_BigramsAndTrichar_LightGBMMulticlass) })]
30+
nameof(CV_Multiclass_WikiDetox_BigramsAndTrichar_LightGBMMulticlass),
31+
nameof(CV_Multiclass_WikiDetox_WordEmbeddings_OVAAveragedPerceptron),
32+
nameof(CV_Multiclass_WikiDetox_WordEmbeddings_SDCAMC)})]
3133
public void SetupTrainingSpeedTests()
3234
{
3335
_dataPath_Wiki = Path.GetFullPath(TestDatasets.WikiDetox.trainFilename);
@@ -81,5 +83,25 @@ public void Test_Multiclass_WikiDetox_BigramsAndTrichar_OVAAveragedPerceptron()
8183
Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false);
8284
}
8385
}
86+
// Benchmark: runs a MAML "CV" command (k=5) over the file at _dataPath_Wiki using the
// OVA trainer wrapping AveragedPerceptron{iter=10}.
// Pipeline named in the command string: Convert(logged_in -> R4), CategoricalTransform(ns),
// TextTransform with tokens=+ and NGramExtractorTransform{ngram=2}, then
// WordEmbeddingsTransform over FeaturesText_TransformedText with model=FastTextWikipedia300D.
// The final Concat builds Features from FeaturesText, FeaturesWordEmbedding, logged_in and ns.
87+
[Benchmark]
88+
public void CV_Multiclass_WikiDetox_WordEmbeddings_OVAAveragedPerceptron()
89+
{
90+
string cmd = @"CV tr=OVA{p=AveragedPerceptron{iter=10}} k=5 loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+} data=" + _dataPath_Wiki + " xf=Convert{col=logged_in type=R4} xf=CategoricalTransform{col=ns} xf=TextTransform{col=FeaturesText:comment tokens=+ wordExtractor=NGramExtractorTransform{ngram=2}} xf=WordEmbeddingsTransform{col=FeaturesWordEmbedding:FeaturesText_TransformedText model=FastTextWikipedia300D} xf=Concat{col=Features:FeaturesText,FeaturesWordEmbedding,logged_in,ns}";
91+
// Output is routed to EmptyWriter.Instance with verbose off — presumably to keep console
// writes out of the measurement; confirm against EmptyWriter's definition.
using (var tlc = new TlcEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance))
92+
{
93+
Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false);
94+
}
95+
}
96+
// Benchmark: runs a MAML "CV" command (k=5) over the file at _dataPath_Wiki using the
// SDCAMC trainer.
// Here TextTransform is given empty word and char extractors (wordExtractor={} charExtractor={})
// with tokens=+, and WordEmbeddingsTransform (model=FastTextWikipedia300D) reads
// FeaturesText_TransformedText.
// The final Concat builds Features from FeaturesWordEmbedding, logged_in and ns only;
// FeaturesText itself is not concatenated.
97+
[Benchmark]
98+
public void CV_Multiclass_WikiDetox_WordEmbeddings_SDCAMC()
99+
{
100+
string cmd = @"CV tr=SDCAMC k=5 loader=TextLoader{quote=- sparse=- col=Label:R4:0 col=rev_id:TX:1 col=comment:TX:2 col=logged_in:BL:4 col=ns:TX:5 col=sample:TX:6 col=split:TX:7 col=year:R4:3 header=+} data=" + _dataPath_Wiki + " xf=Convert{col=logged_in type=R4} xf=CategoricalTransform{col=ns} xf=TextTransform{col=FeaturesText:comment tokens=+ wordExtractor={} charExtractor={}} xf=WordEmbeddingsTransform{col=FeaturesWordEmbedding:FeaturesText_TransformedText model=FastTextWikipedia300D} xf=Concat{col=Features:FeaturesWordEmbedding,logged_in,ns}";
101+
// Output is routed to EmptyWriter.Instance with verbose off — presumably to keep console
// writes out of the measurement; confirm against EmptyWriter's definition.
using (var tlc = new TlcEnvironment(verbose: false, sensitivity: MessageSensitivity.None, outWriter: EmptyWriter.Instance))
102+
{
103+
Maml.MainCore(tlc, cmd, alwaysPrintStacktrace: false);
104+
}
105+
}
84106
}
85107
}

0 commit comments

Comments
 (0)