Skip to content

Commit 135791d

Browse files
committed
Fixes data invariant format problems
The tests do not pass on machines whose culture uses formatting conventions different from those of the English language. The error happens because the results are written in a format different from the expected one. 1. The main fix is to imbue the test thread with the en-US culture so that results are output in a format that is comparable with the test format. 2. A secondary fix is to make comparisons between culture-sensitive data type representations invariant when they do not have human-readable dimensions. In the OptimizationMonitor.cs case, the cast between a culture-sensitive floating-point value and a string causes errors of orders of magnitude in the output results. The intention of this patch is not to offer a robust solution or to remove future issues. There is room for refactoring where, for instance, locale information would be applied to input and output, and logging/tracing would be clearly separated from other kinds of locale-sensitive handling. This way, culture-sensitive parts would be separated and particular output formats could be tested as separate cases if so desired. Fixes dotnet#74
1 parent 3780923 commit 135791d

File tree

13 files changed

+74
-59
lines changed

13 files changed

+74
-59
lines changed

src/Microsoft.ML.Core/Environment/TlcEnvironment.cs

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
using System;
88
using System.Collections.Concurrent;
99
using System.Collections.Generic;
10+
using System.Globalization;
1011
using System.IO;
1112
using System.Linq;
1213
using System.Threading;
@@ -212,7 +213,7 @@ public void GetAndPrintAllProgress(ProgressReporting.ProgressTracker progressTra
212213
PrintOperationStop(_out, ev);
213214
break;
214215
case ProgressReporting.ProgressEvent.EventKind.Progress:
215-
_out.Write("[{0}] ", ev.Index);
216+
_out.Write(string.Format(CultureInfo.InvariantCulture, "[{0}] ", ev.Index));
216217
PrintProgressLine(_out, ev);
217218
break;
218219
}
@@ -225,7 +226,7 @@ public void GetAndPrintAllProgress(ProgressReporting.ProgressTracker progressTra
225226

226227
if (PrintDot())
227228
{
228-
// We need to print an extended status line. At this point, every event should be
229+
// We need to print an extended status line. At this point, every event should be
229230
// a non-checkpoint progress event.
230231
bool needPrepend = entries.Count > 1;
231232
foreach (var ev in entries)
@@ -236,7 +237,7 @@ public void GetAndPrintAllProgress(ProgressReporting.ProgressTracker progressTra
236237
{
237238
EnsureNewLine();
238239
WriteAndReturnLinePrefix(MessageSensitivity.None, _out);
239-
_out.Write("[{0}] ", ev.Index);
240+
_out.Write(string.Format(CultureInfo.InvariantCulture, "[{0}] ", ev.Index));
240241
}
241242
else
242243
{
@@ -252,24 +253,24 @@ public void GetAndPrintAllProgress(ProgressReporting.ProgressTracker progressTra
252253

253254
private static void PrintOperationStart(TextWriter writer, ProgressReporting.ProgressEvent ev)
254255
{
255-
writer.WriteLine("[{0}] '{1}' started.", ev.Index, ev.Name);
256+
writer.WriteLine(string.Format(CultureInfo.InvariantCulture, "[{0}] '{1}' started.", ev.Index, ev.Name));
256257
}
257258

258259
private static void PrintOperationStop(TextWriter writer, ProgressReporting.ProgressEvent ev)
259260
{
260-
writer.WriteLine("[{0}] '{1}' finished in {2}.", ev.Index, ev.Name, ev.EventTime - ev.StartTime);
261+
writer.WriteLine(string.Format(CultureInfo.InvariantCulture, "[{0}] '{1}' finished in {2}.", ev.Index, ev.Name, ev.EventTime - ev.StartTime));
261262
}
262263

263264
private void PrintProgressLine(TextWriter writer, ProgressReporting.ProgressEvent ev)
264265
{
265266
// Elapsed time.
266267
var elapsed = ev.EventTime - ev.StartTime;
267268
if (elapsed.TotalMinutes < 1)
268-
writer.Write("(00:{0:00.00})", elapsed.TotalSeconds);
269+
writer.Write(string.Format(CultureInfo.InvariantCulture, "(00:{0:00.00})", elapsed.TotalSeconds));
269270
else if (elapsed.TotalHours < 1)
270-
writer.Write("({0:00}:{1:00.0})", elapsed.Minutes, elapsed.TotalSeconds - 60 * elapsed.Minutes);
271+
writer.Write(string.Format(CultureInfo.InvariantCulture, "({0:00}:{1:00.0})", elapsed.Minutes, elapsed.TotalSeconds - 60 * elapsed.Minutes));
271272
else
272-
writer.Write("({0:00}:{1:00}:{2:00})", elapsed.Hours, elapsed.Minutes, elapsed.Seconds);
273+
writer.Write(string.Format(CultureInfo.InvariantCulture, "({0:00}:{1:00}:{2:00})", elapsed.Hours, elapsed.Minutes, elapsed.Seconds));
273274

274275
// Progress units.
275276
bool first = true;
@@ -281,7 +282,7 @@ private void PrintProgressLine(TextWriter writer, ProgressReporting.ProgressEven
281282
first = false;
282283
writer.Write("{0}", ev.ProgressEntry.Progress[i]);
283284
if (ev.ProgressEntry.ProgressLim[i] != null)
284-
writer.Write("/{0}", ev.ProgressEntry.ProgressLim[i].Value);
285+
writer.Write("/{0}", ev.ProgressEntry.ProgressLim[i].Value.ToString(CultureInfo.InvariantCulture));
285286
writer.Write(" {0}", ev.ProgressEntry.Header.UnitNames[i]);
286287
}
287288

@@ -291,7 +292,7 @@ private void PrintProgressLine(TextWriter writer, ProgressReporting.ProgressEven
291292
if (ev.ProgressEntry.Metrics[i] == null)
292293
continue;
293294
// REVIEW: print metrics prettier.
294-
writer.Write("\t{0}: {1}", ev.ProgressEntry.Header.MetricNames[i], ev.ProgressEntry.Metrics[i].Value);
295+
writer.Write("\t{0}: {1}", ev.ProgressEntry.Header.MetricNames[i], ev.ProgressEntry.Metrics[i].Value.ToString(CultureInfo.InvariantCulture));
295296
}
296297

297298
writer.WriteLine();
@@ -306,7 +307,7 @@ private void EnsureNewLine(bool isError = false)
306307
return;
307308

308309
// If _err and _out is the same writer, we need to print new line as well.
309-
// If _out and _err writes to Console.Out and Console.Error respectively,
310+
// If _out and _err writes to Console.Out and Console.Error respectively,
310311
// in the general user scenario they end up writing to the same underlying stream,
311312
// so write a new line to the stream anyways.
312313
if (isError && _err != _out && (_out != Console.Out || _err != Console.Error))

src/Microsoft.ML.Data/Utilities/TimerScope.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// See the LICENSE file in the project root for more information.
44

55
using System;
6+
using System.Globalization;
67
using Microsoft.ML.Runtime;
78
using Microsoft.ML.Runtime.Data;
89

@@ -46,7 +47,7 @@ public void Dispose()
4647

4748
// REVIEW: This \n\n is to prevent changes across a bunch of baseline files.
4849
// Ideally we should change our comparison method to ignore empty lines.
49-
_ch.Info("{0}\t Time elapsed(s): {1}\n\n", DateTime.Now, elapsedSeconds);
50+
_ch.Info("{0}\t Time elapsed(s): {1}\n\n", DateTime.Now.ToString(CultureInfo.InvariantCulture), elapsedSeconds.ToString(CultureInfo.InvariantCulture));
5051

5152
using (var pipe = _host.StartPipe<TelemetryMessage>("TelemetryPipe"))
5253
{

src/Microsoft.ML.FastTree/Training/Test.cs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
using System;
66
using System.Collections.Generic;
7+
using System.Globalization;
78
using System.Linq;
89
using System.Threading;
910
using System.Threading.Tasks;
@@ -191,7 +192,7 @@ public virtual string FormatInfoString()
191192
var sb = new System.Text.StringBuilder();
192193
foreach (var r in ComputeTests())
193194
{
194-
sb.AppendFormat("{0}.{1}={2}\n", ScoreTracker.DatasetName, r.LossFunctionName, r.FinalValue);
195+
sb.AppendFormat(CultureInfo.InvariantCulture, "{0}.{1}={2}\n", ScoreTracker.DatasetName, r.LossFunctionName, r.FinalValue);
195196
}
196197
return sb.ToString();
197198
}
@@ -377,7 +378,7 @@ public override string FormatInfoString()
377378
{
378379
if (i > 1)
379380
sb.Append("\t");
380-
sb.AppendFormat("@{0}:{1:00.00}", i++, 100.0 * t.FinalValue);
381+
sb.AppendFormat(CultureInfo.InvariantCulture, "@{0}:{1:00.00}", i++, 100.0 * t.FinalValue);
381382
}
382383
sb.AppendLine();
383384
return sb.ToString();
@@ -512,7 +513,7 @@ public override string FormatInfoString()
512513
{
513514
if (i > 1)
514515
sb.Append("\t");
515-
sb.AppendFormat("{0}:{1:00.00}", t.LossFunctionName, t.FinalValue);
516+
sb.AppendFormat(CultureInfo.InvariantCulture, "{0}:{1:00.00}", t.LossFunctionName, t.FinalValue);
516517
i++;
517518
}
518519
sb.AppendLine();

src/Microsoft.ML.FastTree/TreeEnsemble/Ensemble.cs

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
using System;
66
using System.Collections.Generic;
7+
using System.Globalization;
78
using System.IO;
89
using System.Linq;
910
using System.Text;
@@ -128,13 +129,13 @@ public string ToTreeEnsembleIni(FeaturesToContentMap fmap,
128129

129130
numNodes += evaluatorCounter;
130131

131-
sb.AppendFormat("[TreeEnsemble]\nInputs={0}\nEvaluators={1}\n", featureToID.Count, evaluatorCounter + 1);
132+
sb.AppendFormat(CultureInfo.InvariantCulture, "[TreeEnsemble]\nInputs={0}\nEvaluators={1}\n", featureToID.Count, evaluatorCounter + 1);
132133

133134
sb.Append(sbInput);
134135
sb.Append(sbEvaluator);
135136

136137
// Append the final aggregator
137-
sb.AppendFormat("\n[Evaluator:{0}]\nEvaluatorType=Aggregator\nNumNodes={1}\nNodes=", evaluatorCounter + 1, numNodes);
138+
sb.AppendFormat(CultureInfo.InvariantCulture, "\n[Evaluator:{0}]\nEvaluatorType=Aggregator\nNumNodes={1}\nNodes=", evaluatorCounter + 1, numNodes);
138139

139140
// Nodes
140141
if (_firstInputInitializationContent != null)
@@ -163,7 +164,7 @@ public string ToTreeEnsembleIni(FeaturesToContentMap fmap,
163164
{
164165
if (_firstInputInitializationContent != null)
165166
sb.Append("\t");
166-
sb.AppendFormat("{0}", _trees[0].Weight);
167+
sb.AppendFormat(CultureInfo.InvariantCulture, "{0}", _trees[0].Weight);
167168
}
168169

169170
for (int w = 1; w < NumTrees; ++w)
@@ -172,7 +173,7 @@ public string ToTreeEnsembleIni(FeaturesToContentMap fmap,
172173
{
173174
sb.Append("\t");
174175
}
175-
sb.Append(_trees[w].Weight);
176+
sb.Append(_trees[w].Weight.ToString(CultureInfo.InvariantCulture));
176177
}
177178

178179
sb.AppendFormat("\nBias={0}", Bias);
@@ -193,15 +194,15 @@ public string ToTreeEnsembleIni(FeaturesToContentMap fmap,
193194

194195
protected int AppendComments(StringBuilder sb, string trainingParams)
195196
{
196-
sb.AppendFormat("\n\n[Comments]\nC:0=Regression Tree Ensemble\nC:1=Generated using FastTree\nC:2=Created on {0}\n", DateTime.Now);
197+
sb.AppendFormat("\n\n[Comments]\nC:0=Regression Tree Ensemble\nC:1=Generated using FastTree\nC:2=Created on {0}\n", DateTime.Now.ToString(CultureInfo.InvariantCulture));
197198

198199
string[] trainingParamsList = trainingParams.Split(new char[] { '\n' });
199200
int i = 0;
200201
for (; i < trainingParamsList.Length; ++i)
201202
{
202203
if (trainingParamsList[i].Length > 0)
203204
{
204-
sb.AppendFormat("C:{0}=PARAM:{1}\n", i + 3, trainingParamsList[i]);
205+
sb.AppendFormat(CultureInfo.InvariantCulture, "C:{0}=PARAM:{1}\n", i + 3, trainingParamsList[i]);
205206
}
206207
}
207208
return i + 3;
@@ -328,15 +329,15 @@ public string ToGainSummary(FeaturesToContentMap fmap, Dictionary<int, int> feat
328329
foreach (var pair in sortedByGain)
329330
{
330331
int outputInputId = featureToID.ContainsKey(pair.Key) ? featureToID[pair.Key] : 0;
331-
output.Append(string.Format("C:{0}=FG:I{1}:{2}:{3}\n", startingCommentNumber++, outputInputId,
332-
fmap.GetName(pair.Key), Math.Pow(pair.Value, power) / normalizingFactor));
332+
output.AppendFormat(CultureInfo.InvariantCulture, "C:{0}=FG:I{1}:{2}:{3}\n", startingCommentNumber++, outputInputId,
333+
fmap.GetName(pair.Key), Math.Pow(pair.Value, power) / normalizingFactor);
333334
}
334335
return output.ToString();
335336
}
336337

337338
/// <summary>
338339
/// Returns a vector of feature contributions for a given example.
339-
/// <paramref name="builder"/> is used as a buffer to accumulate the contributions across trees.
340+
/// <paramref name="builder"/> is used as a buffer to accumulate the contributions across trees.
340341
/// If <paramref name="builder"/> is null, it will be created, otherwise it will be reused.
341342
/// </summary>
342343
internal void GetFeatureContributions(ref VBuffer<float> features, ref VBuffer<float> contribs, ref BufferBuilder<float> builder)

src/Microsoft.ML.FastTree/TreeEnsemble/RegressionTree.cs

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@
44

55
using Float = System.Single;
66

7+
using Newtonsoft.Json.Linq;
78
using System;
89
using System.Collections.Generic;
10+
using System.Globalization;
911
using System.IO;
1012
using System.Linq;
1113
using System.Text;
@@ -18,7 +20,7 @@
1820
using LotusvNext.Expressions;*/
1921
using Microsoft.ML.Runtime.Model.Pfa;
2022
using Microsoft.ML.Runtime.Internal.Internallearn;
21-
using Newtonsoft.Json.Linq;
23+
2224

2325
namespace Microsoft.ML.Runtime.FastTree.Internal
2426
{
@@ -45,12 +47,12 @@ public class RegressionTree
4547
/// </summary>
4648
public bool[] CategoricalSplit { get; }
4749
/// <summary>
48-
/// Array of categorical values for the categorical feature that might be chosen as
50+
/// Array of categorical values for the categorical feature that might be chosen as
4951
/// a split feature for a node.
5052
/// </summary>
5153
public int[][] CategoricalSplitFeatures;
5254
/// <summary>
53-
/// For a given categorical feature that is chosen as a split feature for a node, this
55+
/// For a given categorical feature that is chosen as a split feature for a node, this
5456
/// array contains its start and end range in the input feature vector at prediction time.
5557
/// </summary>
5658
public int[][] CategoricalSplitFeatureRanges;
@@ -1184,7 +1186,7 @@ public void ToTreeEnsembleFormat(StringBuilder sbEvaluator, StringBuilder sbInpu
11841186
private void ToTreeEnsembleFormatForCategoricalSplit(StringBuilder sbEvaluator, StringBuilder sbInput, FeaturesToContentMap featureContents,
11851187
ref int evaluatorCounter, Dictionary<int, int> featureToId, Dictionary<int, int> categoricalSplitNodeToId)
11861188
{
1187-
//REVIEW: Can all these conditions even be true?
1189+
//REVIEW: Can all these conditions even be true?
11881190
if (CategoricalSplitFeatures == null ||
11891191
CategoricalSplitFeatures.Length == 0 ||
11901192
CategoricalSplitFeatures.All(val => val == null))
@@ -1234,7 +1236,7 @@ private void ToTreeEnsembleFormatForCategoricalSplit(StringBuilder sbEvaluator,
12341236
sbLteChild.Append((n + 1) + toAppend);
12351237
sbGtChild.Append(~n + toAppend);
12361238
sbOutput.Append("1\t");
1237-
sbThreshold.Append(((double)0.5).ToString("R") + toAppend);
1239+
sbThreshold.Append(((double)0.5).ToString("R", CultureInfo.InvariantCulture) + toAppend);
12381240
}
12391241

12401242
sbOutput.Append("0");
@@ -1266,12 +1268,12 @@ public string ToOldIni(FeatureNameCollection featureNames)
12661268
if (gtChildCorrected < 0)
12671269
gtChildCorrected = numNonLeaves + (~gtChildCorrected);
12681270

1269-
output.AppendFormat("\nNodeType:{0}=Branch\nNodeDecision:{0}={1}\nNodeThreshold:{0}={2}\nNodeLTE:{0}={3}\nNodeGT:{0}={4}\n", n, name, currentThreshold, lteChildCorrected, gtChildCorrected);
1271+
output.AppendFormat(CultureInfo.InvariantCulture, "\nNodeType:{0}=Branch\nNodeDecision:{0}={1}\nNodeThreshold:{0}={2}\nNodeLTE:{0}={3}\nNodeGT:{0}={4}\n", n, name, currentThreshold, lteChildCorrected, gtChildCorrected);
12701272
}
12711273

12721274
for (int n = 0; n < NumLeaves; ++n)
12731275
{
1274-
output.AppendFormat("\nNodeType:{0}=Value\nNodeValue:{0}={1}\n", numNonLeaves + n, LeafValues[n]);
1276+
output.AppendFormat(CultureInfo.InvariantCulture, "\nNodeType:{0}=Value\nNodeValue:{0}={1}\n", numNonLeaves + n, LeafValues[n]);
12751277
}
12761278

12771279
return output.ToString();
@@ -1552,7 +1554,7 @@ public void AppendFeatureContributions(ref VBuffer<Float> src, BufferBuilder<Flo
15521554
var ghostLeaf = GetLeafFrom(ref src, otherWay);
15531555
var ghostOutput = GetOutput(ghostLeaf);
15541556

1555-
// If the ghost got a smaller output, the contribution of the feature is positive, so
1557+
// If the ghost got a smaller output, the contribution of the feature is positive, so
15561558
// the contribution is true minus ghost.
15571559
contributions.AddFeature(ifeat, (Float)(trueOutput - ghostOutput));
15581560
}

src/Microsoft.ML.FastTree/Utils/Timer.cs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// See the LICENSE file in the project root for more information.
44

55
using System;
6+
using System.Globalization;
67
using System.Text;
78
using System.Threading;
89

@@ -156,7 +157,7 @@ public override string ToString()
156157

157158
string padded = "Name".PadRight(MaxEventNameLen);
158159

159-
sb.AppendFormat("{0} {1,10}{2,10}{3,8}{4,11}\n", padded, "Time", "%", "#Calls", "Time/Call");
160+
sb.AppendFormat(CultureInfo.InvariantCulture, "{0} {1,10}{2,10}{3,8}{4,11}\n", padded, "Time", "%", "#Calls", "Time/Call");
160161
foreach (TimerEvent n in Enum.GetValues(typeof(TimerEvent)))
161162
{
162163
double time = (double)TickTotals[(int)n] / Stopwatch.Frequency;
@@ -167,7 +168,7 @@ public override string ToString()
167168

168169
padded = n.ToString().PadRight(MaxEventNameLen);
169170

170-
sb.AppendFormat("{0} {1,10:0.000}{2,9:00.00}%{3,8}{4,11:0.000}\n", padded, time, perc, numCalls, timePerCall);
171+
sb.AppendFormat(CultureInfo.InvariantCulture, "{0} {1,10:0.000}{2,9:00.00}%{3,8}{4,11:0.000}\n", padded, time, perc, numCalls, timePerCall);
171172
}
172173
sb.AppendFormat("Count Statistics:\n");
173174
padded = "Name".PadRight(MaxEventNameLen);
@@ -178,7 +179,7 @@ public override string ToString()
178179

179180
padded = n.ToString().PadRight(MaxEventNameLen);
180181

181-
sb.AppendFormat("{0} {1,10}\n", padded, count);
182+
sb.AppendFormat(CultureInfo.InvariantCulture, "{0} {1,10}\n", padded, count);
182183
}
183184
return sb.ToString();
184185
}

src/Microsoft.ML.FastTree/Utils/VectorUtils.cs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
// See the LICENSE file in the project root for more information.
44

55
using System;
6+
using System.Globalization;
67
using System.Text;
78

89
namespace Microsoft.ML.Runtime.FastTree.Internal
@@ -338,7 +339,7 @@ public static string ToString(double[] vector)
338339
{
339340
sb.Append(", ");
340341
}
341-
sb.Append(vector[f]);
342+
sb.Append(vector[f].ToString(CultureInfo.InvariantCulture));
342343
}
343344
return sb.ToString();
344345
}

0 commit comments

Comments
 (0)