Skip to content

Commit 5e95fd9

Browse files
daholste authored and Dmitry-A committed
make params for MLContext data extensions match ML.NET default names and values; update gitignore; nit rev for Benchmarking.cs (dotnet#5)
1 parent e654cd3 commit 5e95fd9

File tree

4 files changed

+44
-82
lines changed

4 files changed

+44
-82
lines changed

src/AutoML/API/MLContextExtensions.cs

Lines changed: 15 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -176,39 +176,23 @@ public static class DataExtensions
176176
{
177177
// Delimiter, header, column datatype inference
178178
public static ColumnInferenceResult InferColumns(this DataOperations catalog, string path, string label,
179-
bool hasHeader = false, string separator = null, bool? isQuoted = null, bool? isSparse = null)
179+
bool hasHeader = false, char? separatorChar = null, bool? allowQuotedStrings = null, bool? supportSparse = null, bool trimWhitespace = false)
180180
{
181181
UserInputValidationUtil.ValidateInferColumnsArgs(path, label);
182182
var mlContext = new MLContext();
183-
return ColumnInferenceApi.InferColumns(mlContext, path, label, hasHeader, separator, isQuoted, isSparse);
184-
}
185-
186-
// Auto reader (includes column inference)
187-
public static IDataView AutoRead(this DataOperations catalog, Stream stream)
188-
{
189-
throw new NotImplementedException();
183+
return ColumnInferenceApi.InferColumns(mlContext, path, label, hasHeader, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace);
190184
}
191185

192186
public static IDataView AutoRead(this DataOperations catalog, string path, string label,
193-
bool hasHeader = false, string separator = null, bool? isQuoted = null, bool? isSparse = null)
187+
bool hasHeader = false, char? separatorChar = null, bool? allowQuotedStrings = null, bool? supportSparse = null, bool trimWhitespace = false)
194188
{
195189
UserInputValidationUtil.ValidateAutoReadArgs(path, label);
196190
var mlContext = new MLContext();
197-
var columnInferenceResult = ColumnInferenceApi.InferColumns(mlContext, path, label, hasHeader, separator, isQuoted, isSparse);
191+
var columnInferenceResult = ColumnInferenceApi.InferColumns(mlContext, path, label, hasHeader, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace);
198192
var textLoader = columnInferenceResult.BuildTextLoader();
199193
return textLoader.Read(path);
200194
}
201195

202-
public static IDataView AutoRead(this DataOperations catalog, IMultiStreamSource source, string label,
203-
bool hasHeader = false, string separator = null, bool? isQuoted = null, bool? isSparse = null)
204-
{
205-
UserInputValidationUtil.ValidateAutoReadArgs(source, label);
206-
var mlContext = new MLContext();
207-
var columnInferenceResult = ColumnInferenceApi.InferColumns(mlContext, source, label, hasHeader, separator, isQuoted, isSparse);
208-
var textLoader = columnInferenceResult.BuildTextLoader();
209-
return textLoader.Read(source);
210-
}
211-
212196
public static TextLoader CreateTextReader(this DataOperations catalog, ColumnInferenceResult columnInferenceResult)
213197
{
214198
UserInputValidationUtil.ValidateCreateTextReaderArgs(columnInferenceResult);
@@ -232,30 +216,33 @@ public enum MachineLearningTaskType
232216
public class ColumnInferenceResult
233217
{
234218
public readonly IEnumerable<(TextLoader.Column, ColumnPurpose)> Columns;
235-
public readonly bool IsQuoted;
236-
public readonly bool IsSparse;
219+
public readonly bool AllowQuotedStrings;
220+
public readonly bool SupportSparse;
237221
public readonly string Separator;
238222
public readonly bool HasHeader;
223+
public readonly bool TrimWhitespace;
239224

240225
public ColumnInferenceResult(IEnumerable<(TextLoader.Column, ColumnPurpose)> columns,
241-
bool isQuoted, bool isSparse, string separator, bool hasHeader)
226+
bool allowQuotedStrings, bool supportSparse, string separator, bool hasHeader, bool trimWhitespace)
242227
{
243228
Columns = columns;
244-
IsQuoted = isQuoted;
245-
IsSparse = isSparse;
229+
AllowQuotedStrings = allowQuotedStrings;
230+
SupportSparse = supportSparse;
246231
Separator = separator;
247232
HasHeader = hasHeader;
233+
TrimWhitespace = trimWhitespace;
248234
}
249235

250236
internal TextLoader BuildTextLoader()
251237
{
252238
var context = new MLContext();
253239
return new TextLoader(context, new TextLoader.Arguments() {
254-
AllowQuoting = IsQuoted,
255-
AllowSparse = IsSparse,
240+
AllowQuoting = AllowQuotedStrings,
241+
AllowSparse = SupportSparse,
256242
Column = Columns.Select(c => c.Item1).ToArray(),
257243
Separator = Separator,
258-
HasHeader = HasHeader
244+
HasHeader = HasHeader,
245+
TrimWhitespace = TrimWhitespace
259246
});
260247
}
261248
}

src/AutoML/ColumnInference/ColumnInferenceApi.cs

Lines changed: 25 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -7,44 +7,47 @@ namespace Microsoft.ML.Auto
77
internal static class ColumnInferenceApi
88
{
99
public static ColumnInferenceResult InferColumns(MLContext context, string path, string label,
10-
bool hasHeader, string separator, bool? isQuoted, bool? isSparse)
10+
bool hasHeader, char? separatorChar, bool? allowQuotedStrings, bool? supportSparse, bool trimWhitespace)
1111
{
1212
var sample = TextFileSample.CreateFromFullFile(path);
13-
Func<TextLoader, IDataView> createDataView = (textLoader) =>
13+
var splitInference = InferSplit(sample, separatorChar, allowQuotedStrings, supportSparse);
14+
var typeInference = InferColumnTypes(context, sample, splitInference);
15+
var typedLoaderArgs = new TextLoader.Arguments
1416
{
15-
return textLoader.Read(path);
17+
Column = ColumnTypeInference.GenerateLoaderColumns(typeInference.Columns),
18+
Separator = splitInference.Separator,
19+
AllowSparse = splitInference.AllowSparse,
20+
AllowQuoting = splitInference.AllowQuote,
21+
HasHeader = hasHeader,
22+
TrimWhitespace = trimWhitespace
1623
};
17-
return InferColumns(context, sample, createDataView, label, hasHeader, separator, isQuoted, isSparse);
18-
}
24+
var textLoader = context.Data.CreateTextReader(typedLoaderArgs);
25+
var dataView = textLoader.Read(path);
1926

20-
public static ColumnInferenceResult InferColumns(MLContext context, IMultiStreamSource multiStreamSource,
21-
string label, bool hasHeader, string separator, bool? isQuoted, bool? isSparse)
22-
{
23-
// heuristic: use first stream in multi-stream source to infer column types & split
24-
var stream = multiStreamSource.Open(0);
25-
var sample = TextFileSample.CreateFromFullStream(stream);
27+
var purposeInferenceResult = PurposeInference.InferPurposes(context, dataView, label);
2628

27-
Func<TextLoader, IDataView> createDataView = (textLoader) =>
28-
{
29-
return textLoader.Read(multiStreamSource);
30-
};
29+
// infer column grouping and generate column names
30+
var groupingResult = ColumnGroupingInference.InferGroupingAndNames(context, hasHeader,
31+
typeInference.Columns, purposeInferenceResult);
3132

32-
return InferColumns(context, sample, createDataView, label, hasHeader, separator, isQuoted, isSparse);
33+
// build result objects & return
34+
var inferredColumns = groupingResult.Select(c => (c.GenerateTextLoaderColumn(), c.Purpose)).ToArray();
35+
return new ColumnInferenceResult(inferredColumns, splitInference.AllowQuote, splitInference.AllowSparse, splitInference.Separator, hasHeader, trimWhitespace);
3336
}
3437

35-
private static TextFileContents.ColumnSplitResult InferSplit(TextFileSample sample, string separator, bool? isQuoted, bool? isSparse)
38+
private static TextFileContents.ColumnSplitResult InferSplit(TextFileSample sample, char? separatorChar, bool? allowQuotedStrings, bool? supportSparse)
3639
{
37-
var separatorCandidates = separator == null ? TextFileContents.DefaultSeparators : new string[] { separator };
40+
var separatorCandidates = separatorChar == null ? TextFileContents.DefaultSeparators : new char[] { separatorChar.Value };
3841
var splitInference = TextFileContents.TrySplitColumns(sample, separatorCandidates);
3942

4043
// respect passed-in overrides
41-
if(isQuoted != null)
44+
if(allowQuotedStrings != null)
4245
{
43-
splitInference.AllowQuote = isQuoted.Value;
46+
splitInference.AllowQuote = allowQuotedStrings.Value;
4447
}
45-
if(isSparse != null)
48+
if(supportSparse != null)
4649
{
47-
splitInference.AllowSparse = isSparse.Value;
50+
splitInference.AllowSparse = supportSparse.Value;
4851
}
4952

5053
if (!splitInference.IsSuccess)
@@ -75,33 +78,5 @@ private static ColumnTypeInference.InferenceResult InferColumnTypes(MLContext co
7578

7679
return typeInferenceResult;
7780
}
78-
79-
private static ColumnInferenceResult InferColumns(MLContext context,
80-
TextFileSample sample, Func<TextLoader, IDataView> createDataView, string label,
81-
bool hasHeader, string separator, bool? isQuoted, bool? isSparse)
82-
{
83-
var splitInference = InferSplit(sample, separator, isQuoted, isSparse);
84-
var typeInference = InferColumnTypes(context, sample, splitInference);
85-
var typedLoaderArgs = new TextLoader.Arguments
86-
{
87-
Column = ColumnTypeInference.GenerateLoaderColumns(typeInference.Columns),
88-
Separator = splitInference.Separator,
89-
AllowSparse = splitInference.AllowSparse,
90-
AllowQuoting = splitInference.AllowQuote,
91-
HasHeader = hasHeader
92-
};
93-
var textLoader = context.Data.CreateTextReader(typedLoaderArgs);
94-
var dataView = createDataView(textLoader);
95-
96-
var purposeInferenceResult = PurposeInference.InferPurposes(context, dataView, label);
97-
98-
// infer column grouping and generate column names
99-
var groupingResult = ColumnGroupingInference.InferGroupingAndNames(context, hasHeader,
100-
typeInference.Columns, purposeInferenceResult);
101-
102-
// build result objects & return
103-
var inferredColumns = groupingResult.Select(c => (c.GenerateTextLoaderColumn(), c.Purpose)).ToArray();
104-
return new ColumnInferenceResult(inferredColumns, splitInference.AllowQuote, splitInference.AllowSparse, splitInference.Separator, hasHeader);
105-
}
10681
}
10782
}

src/AutoML/ColumnInference/TextFileContents.cs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ public ColumnSplitResult(bool isSuccess, string separator, bool allowQuote, bool
3737
// If the fraction of lines having the same number of columns exceeds this, we consider the column count to be known.
3838
private const Double UniformColumnCountThreshold = 0.98;
3939

40-
public static string[] DefaultSeparators = new[] { "tab", ",", ";", " " };
40+
public static char[] DefaultSeparators = new[] { '\t', ',', ';', ' ' };
4141

4242
/// <summary>
4343
/// Attempt to detect text loader arguments.
@@ -46,7 +46,7 @@ public ColumnSplitResult(bool isSuccess, string separator, bool allowQuote, bool
4646
/// and this number of columns is more than 1.
4747
/// We sweep on separator, allow sparse and allow quote parameter.
4848
/// </summary>
49-
public static ColumnSplitResult TrySplitColumns(IMultiStreamSource source, string[] separatorCandidates)
49+
public static ColumnSplitResult TrySplitColumns(IMultiStreamSource source, char[] separatorCandidates)
5050
{
5151
var sparse = new[] { true, false };
5252
var quote = new[] { true, false };
@@ -60,7 +60,7 @@ from _sep in separatorCandidates
6060
var args = new TextLoader.Arguments
6161
{
6262
Column = new[] { TextLoader.Column.Parse("C:TX:0-**") },
63-
Separator = perm._sep,
63+
Separator = perm._sep.ToString(),
6464
AllowQuoting = perm._allowQuote,
6565
AllowSparse = perm._allowSparse
6666
};

src/Samples/Benchmarking.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ public static void Run()
2727
{
2828
StoppingCriteria = new ExperimentStoppingCriteria()
2929
{
30-
MaxIterations = 200,
30+
MaxIterations = 5,
3131
TimeOutInMinutes = 1000000000
3232
}
3333
});

0 commit comments

Comments
 (0)