Skip to content

Commit f7e6376

Browse files
authored
unit test additions (including user input validation testing); dead code removal for code coverage (including KDO & associated utils); misc fixes & revs (dotnet#22)
1 parent ab6930c commit f7e6376

10 files changed

+324
-789
lines changed

src/AutoML/ColumnInference/TextFileSample.cs

+4-2
Original file line number | Diff line number | Diff line change
@@ -97,7 +97,7 @@ public static TextFileSample CreateFromFullStream(Stream stream)
9797
return CreateFromHead(stream);
9898
}
9999
var fileSize = stream.Length;
100-
100+
101101
if (fileSize <= 2 * BufferSizeMb * (1 << 20))
102102
{
103103
return CreateFromHead(stream);
@@ -288,11 +288,13 @@ private static bool IsEncodingOkForSampling(byte[] buffer)
288288
break;
289289
}
290290
if (utf8)
291+
{
291292
return true;
293+
}
292294

293295
if (buffer.Take(sniffLim).Any(x => x == 0))
294296
{
295-
// likely a UTF-16 or UTF-32 wuthout a BOM.
297+
// likely a UTF-16 or UTF-32 without a BOM.
296298
return false;
297299
}
298300

src/AutoML/Sweepers/KdoSweeper.cs

-495
This file was deleted.

src/AutoML/Sweepers/SweeperProbabilityUtils.cs

-89
Original file line number | Diff line number | Diff line change
@@ -10,26 +10,6 @@ namespace Microsoft.ML.Auto
1010
{
1111
internal sealed class SweeperProbabilityUtils
1212
{
13-
public SweeperProbabilityUtils()
14-
{
15-
}
16-
17-
public static double Sum(double[] a)
18-
{
19-
double total = 0;
20-
foreach (double d in a)
21-
total += d;
22-
return total;
23-
}
24-
25-
public static double NormalCdf(double x, double mean, double variance)
26-
{
27-
double centered = x - mean;
28-
double ztrans = centered / (Math.Sqrt(variance) * Math.Sqrt(2));
29-
30-
return 0.5 * (1 + ProbabilityFunctions.Erf(ztrans));
31-
}
32-
3313
public static double StdNormalPdf(double x)
3414
{
3515
return 1 / Math.Sqrt(2 * Math.PI) * Math.Exp(-Math.Pow(x, 2) / 2);
@@ -63,45 +43,6 @@ public double[] NormalRVs(int numRVs, double mu, double sigma)
6343
return rvs.ToArray();
6444
}
6545

66-
/// <summary>
67-
/// This performs (slow) roulette-wheel sampling of a categorical distribution. Should be swapped for other
68-
/// method as soon as one is available.
69-
/// </summary>
70-
/// <param name="numSamples">Number of samples to draw.</param>
71-
/// <param name="weights">Weights for distribution (should sum to 1).</param>
72-
/// <returns>A set of indicies indicating which element was chosen for each sample.</returns>
73-
public int[] SampleCategoricalDistribution(int numSamples, double[] weights)
74-
{
75-
// Normalize weights if necessary.
76-
double total = Sum(weights);
77-
if (Math.Abs(1.0 - total) > 0.0001)
78-
weights = Normalize(weights);
79-
80-
// Build roulette wheel.
81-
double[] rw = new double[weights.Length];
82-
double cs = 0.0;
83-
for (int i = 0; i < weights.Length; i++)
84-
{
85-
cs += weights[i];
86-
rw[i] = cs;
87-
}
88-
89-
// Draw samples.
90-
int[] results = new int[numSamples];
91-
for (int i = 0; i < results.Length; i++)
92-
{
93-
double u = AutoMlUtils.Random.NextDouble();
94-
results[i] = BinarySearch(rw, u, 0, rw.Length - 1);
95-
}
96-
97-
return results;
98-
}
99-
100-
public double SampleUniform()
101-
{
102-
return AutoMlUtils.Random.NextDouble();
103-
}
104-
10546
/// <summary>
10647
/// Simple binary search method for finding smallest index in array where value
10748
/// meets or exceeds what you're looking for.
@@ -120,36 +61,6 @@ private int BinarySearch(double[] a, double u, int low, int high)
12061
return a[mid] >= u ? BinarySearch(a, u, low, mid) : BinarySearch(a, u, mid, high);
12162
}
12263

123-
public static double[] Normalize(double[] weights)
124-
{
125-
double total = Sum(weights);
126-
127-
// If all weights equal zero, set to 1 (to avoid divide by zero).
128-
if (total <= Double.Epsilon)
129-
{
130-
Console.WriteLine($"{total} {Double.Epsilon}");
131-
for(var i = 0; i < weights.Length; i++)
132-
{
133-
weights[i] = 1;
134-
}
135-
total = weights.Length;
136-
}
137-
138-
for (int i = 0; i < weights.Length; i++)
139-
weights[i] /= total;
140-
return weights;
141-
}
142-
143-
public static double[] InverseNormalize(double[] weights)
144-
{
145-
weights = Normalize(weights);
146-
147-
for (int i = 0; i < weights.Length; i++)
148-
weights[i] = 1 - weights[i];
149-
150-
return Normalize(weights);
151-
}
152-
15364
public static Float[] ParameterSetAsFloatArray(IValueGenerator[] sweepParams, ParameterSet ps, bool expandCategoricals = true)
15465
{
15566
AutoMlUtils.Assert(ps.Count == sweepParams.Length);

src/AutoML/Utils/Conversions.cs

-59
Original file line number | Diff line number | Diff line change
@@ -16,21 +16,6 @@ namespace Microsoft.ML.Auto
1616

1717
internal static class Conversions
1818
{
19-
/// <summary>
20-
/// This produces zero for empty. It returns false if the text is not parsable or overflows.
21-
/// </summary>
22-
public static bool TryParse(in TX src, out U1 dst)
23-
{
24-
ulong res;
25-
if (!TryParse(in src, out res) || res > U1.MaxValue)
26-
{
27-
dst = 0;
28-
return false;
29-
}
30-
dst = (U1)res;
31-
return true;
32-
}
33-
3419
/// <summary>
3520
/// This produces zero for empty. It returns false if the text is not parsable.
3621
/// On failure, it sets dst to the NA value.
@@ -207,49 +192,5 @@ public static bool TryParse(in TX src, out BL dst)
207192
dst = false;
208193
return false;
209194
}
210-
211-
/// <summary>
212-
/// This produces zero for empty. It returns false if the text is not parsable or overflows.
213-
/// </summary>
214-
public static bool TryParse(in TX src, out U8 dst)
215-
{
216-
if (src.IsEmpty)
217-
{
218-
dst = 0;
219-
return false;
220-
}
221-
222-
return TryParseCore(src.Span, out dst);
223-
}
224-
225-
private static bool TryParseCore(ReadOnlySpan<char> span, out ulong dst)
226-
{
227-
ulong res = 0;
228-
int ich = 0;
229-
while (ich < span.Length)
230-
{
231-
uint d = (uint)span[ich++] - (uint)'0';
232-
if (d >= 10)
233-
goto LFail;
234-
235-
// If any of the top three bits of prev are set, we're guaranteed to overflow.
236-
if ((res & 0xE000000000000000UL) != 0)
237-
goto LFail;
238-
239-
// Given that tmp = 8 * res doesn't overflow, if 10 * res + d overflows, then it overflows to
240-
// 10 * res + d - 2^n = tmp + (2 * res + d - 2^n). Clearly the paren group is negative,
241-
// so the new result (after overflow) will be less than tmp. The converse is also true.
242-
ulong tmp = res << 3;
243-
res = tmp + (res << 1) + d;
244-
if (res < tmp)
245-
goto LFail;
246-
}
247-
dst = res;
248-
return true;
249-
250-
LFail:
251-
dst = 0;
252-
return false;
253-
}
254195
}
255196
}

src/AutoML/Utils/Stats.cs

-83
This file was deleted.

0 commit comments

Comments
 (0)