Skip to content

Commit f04cd68

Browse files
committed
kept overload with Arguments and removed advanced settings
1 parent 2986558 commit f04cd68

25 files changed

+101
-66
lines changed

src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1013,23 +1013,27 @@ private bool HasHeader
10131013
/// <param name="env">The environment to use.</param>
10141014
/// <param name="columns">Defines a mapping between input columns in the file and IDataView columns.</param>
10151015
/// <param name="hasHeader">Whether the file has a header.</param>
1016-
/// <param name="separatorChars">Defines the characters used as separators between data points in a row. By default the tab character is taken as separator.</param>
1017-
/// <param name="advancedSettings">A delegate to apply all the advanced arguments to the algorithm.</param>
1016+
/// <param name="separatorChar">The character used as the separator between data points in a row. By default the tab character is used as the separator.</param>
10181017
/// <param name="dataSample">Allows exposing items that can be used for reading.</param>
1019-
public TextLoader(IHostEnvironment env, Column[] columns, bool hasHeader = false, char[] separatorChars = null, Action<Arguments> advancedSettings = null, IMultiStreamSource dataSample = null)
1020-
: this(env, MakeArgs(columns, hasHeader, separatorChars, advancedSettings), dataSample)
1018+
public TextLoader(IHostEnvironment env, Column[] columns, bool hasHeader = false, char separatorChar = '\t', IMultiStreamSource dataSample = null)
1019+
: this(env, MakeArgs(columns, hasHeader, new[] { separatorChar }), dataSample)
10211020
{
10221021
}
10231022

1024-
private static Arguments MakeArgs(Column[] columns, bool hasHeader, char[] separatorChars, Action<Arguments> advancedSettings)
1023+
private static Arguments MakeArgs(Column[] columns, bool hasHeader, char[] separatorChars)
10251024
{
10261025
separatorChars = separatorChars ?? new[] { '\t' };
10271026
var result = new Arguments { Column = columns, HasHeader = hasHeader, SeparatorChars = separatorChars};
1028-
advancedSettings?.Invoke(result);
10291027
return result;
10301028
}
10311029

1032-
internal TextLoader(IHostEnvironment env, Arguments args, IMultiStreamSource dataSample = null)
1030+
/// <summary>
1031+
/// Loads a text file into an <see cref="IDataView"/>. Supports basic mapping from input columns to IDataView columns.
1032+
/// </summary>
1033+
/// <param name="env">The environment to use.</param>
1034+
/// <param name="args">Defines the settings of the load operation.</param>
1035+
/// <param name="dataSample">Allows exposing items that can be used for reading.</param>
1036+
public TextLoader(IHostEnvironment env, Arguments args, IMultiStreamSource dataSample = null)
10331037
{
10341038
Contracts.CheckValue(env, nameof(env));
10351039
_host = env.Register(RegistrationName);
@@ -1317,11 +1321,19 @@ internal static IDataLoader Create(IHostEnvironment env, Arguments args, IMultiS
13171321
/// <param name="env">The environment to use.</param>
13181322
/// <param name="columns">Defines a mapping between input columns in the file and IDataView columns.</param>
13191323
/// <param name="hasHeader">Whether the file has a header.</param>
1320-
/// <param name="separatorChars">Defines the characters used as separators between data points in a row. By default the tab character is taken as separator.</param>
1321-
/// <param name="advancedSettings">A delegate to apply all the advanced arguments to the algorithm.</param>
1324+
/// <param name="separatorChar">The character used as the separator between data points in a row. By default the tab character is used as the separator.</param>
1325+
/// <param name="fileSource">Specifies a file from which to read.</param>
1326+
public static IDataView ReadFile(IHostEnvironment env, IMultiStreamSource fileSource, Column[] columns, bool hasHeader = false, char separatorChar = '\t')
1327+
=> new TextLoader(env, columns, hasHeader, separatorChar, fileSource).Read(fileSource);
1328+
1329+
/// <summary>
1330+
/// Loads a text file into an <see cref="IDataView"/>. Supports basic mapping from input columns to IDataView columns.
1331+
/// </summary>
1332+
/// <param name="env">The environment to use.</param>
13221333
/// <param name="fileSource">Specifies a file from which to read.</param>
1323-
public static IDataView ReadFile(IHostEnvironment env, IMultiStreamSource fileSource, Column[] columns, bool hasHeader = false, char[] separatorChars = null, Action<Arguments> advancedSettings = null)
1324-
=> new TextLoader(env, columns, hasHeader, separatorChars, advancedSettings, fileSource).Read(fileSource);
1334+
/// <param name="args">Defines the settings of the load operation.</param>
1335+
public static IDataView ReadFile(IHostEnvironment env, IMultiStreamSource fileSource, Arguments args)
1336+
=> new TextLoader(env, args, fileSource).Read(fileSource);
13251337

13261338
public void Save(ModelSaveContext ctx)
13271339
{

src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,38 +14,63 @@ namespace Microsoft.ML
1414
public static class TextLoaderSaverCatalog
1515
{
1616
/// <summary>
17-
/// Create a text reader.
17+
/// Create a text reader <see cref="TextLoader"/>.
1818
/// </summary>
1919
/// <param name="catalog">The catalog.</param>
2020
/// <param name="columns">The columns of the schema.</param>
2121
/// <param name="hasHeader">Whether the file has a header.</param>
22-
/// <param name="separatorChars">Defines the characters used as separators between data points in a row. By default the tab character is taken as separator.</param>
23-
/// <param name="advancedSettings">The delegate to set additional settings.</param>
22+
/// <param name="separatorChar">The character used as the separator between data points in a row. By default the tab character is used as the separator.</param>
2423
/// <param name="dataSample">The optional location of a data sample.</param>
2524
public static TextLoader TextReader(this DataOperations catalog,
26-
Column[] columns, bool hasHeader = false, char[] separatorChars = null, Action<Arguments> advancedSettings = null, IMultiStreamSource dataSample = null)
27-
=> new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, hasHeader, separatorChars, advancedSettings, dataSample);
25+
Column[] columns, bool hasHeader = false, char separatorChar = '\t', IMultiStreamSource dataSample = null)
26+
=> new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, hasHeader, separatorChar, dataSample);
27+
28+
/// <summary>
29+
/// Create a text reader <see cref="TextLoader"/>.
30+
/// </summary>
31+
/// <param name="catalog">The catalog.</param>
32+
/// <param name="args">Defines the settings of the load operation.</param>
33+
/// <param name="dataSample">Allows exposing items that can be used for reading.</param>
34+
public static TextLoader TextReader(this DataOperations catalog, Arguments args, IMultiStreamSource dataSample = null)
35+
=> new TextLoader(CatalogUtils.GetEnvironment(catalog), args, dataSample);
2836

2937
/// <summary>
3038
/// Read a data view from a text file using <see cref="TextLoader"/>.
3139
/// </summary>
3240
/// <param name="catalog">The catalog.</param>
3341
/// <param name="columns">The columns of the schema.</param>
3442
/// <param name="hasHeader">Whether the file has a header.</param>
35-
/// <param name="separatorChars">Defines the characters used as separators between data points in a row. By default the tab character is taken as separator.</param>
36-
/// <param name="advancedSettings">The delegate to set additional settings.</param>
43+
/// <param name="separatorChar">The character used as the separator between data points in a row. By default the tab character is used as the separator.</param>
3744
/// <param name="path">The path to the file.</param>
3845
/// <returns>The data view.</returns>
3946
public static IDataView ReadFromTextFile(this DataOperations catalog,
40-
string path, Column[] columns, bool hasHeader = false, char[] separatorChars = null, Action<Arguments> advancedSettings = null)
47+
string path, Column[] columns, bool hasHeader = false, char separatorChar = '\t')
48+
{
49+
Contracts.CheckNonEmpty(path, nameof(path));
50+
51+
var env = catalog.GetEnvironment();
52+
53+
// REVIEW: it is almost always a mistake to have a 'trainable' text loader here.
54+
// Therefore, we are going to disallow data sample.
55+
var reader = new TextLoader(env, columns, hasHeader, separatorChar, dataSample: null);
56+
return reader.Read(new MultiFileSource(path));
57+
}
58+
59+
/// <summary>
60+
/// Read a data view from a text file using <see cref="TextLoader"/>.
61+
/// </summary>
62+
/// <param name="catalog">The catalog.</param>
63+
/// <param name="path">Specifies a file from which to read.</param>
64+
/// <param name="args">Defines the settings of the load operation.</param>
65+
public static IDataView ReadFromTextFile(this DataOperations catalog, string path, Arguments args)
4166
{
4267
Contracts.CheckNonEmpty(path, nameof(path));
4368

4469
var env = catalog.GetEnvironment();
4570

4671
// REVIEW: it is almost always a mistake to have a 'trainable' text loader here.
4772
// Therefore, we are going to disallow data sample.
48-
var reader = new TextLoader(env, columns, hasHeader, separatorChars, advancedSettings, dataSample: null);
73+
var reader = new TextLoader(env, args, dataSample: null);
4974
return reader.Read(new MultiFileSource(path));
5075
}
5176

src/Microsoft.ML.Data/Utilities/ModelFileUtils.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ public static IEnumerable<KeyValuePair<ColumnRole, string>> LoadRoleMappingsOrNu
283283
{
284284
// REVIEW: Should really validate the schema here, and consider
285285
// ignoring this stream if it isn't as expected.
286-
var loader = TextLoader.ReadFile(env, new RepositoryStreamWrapper(rep, DirTrainingInfo, RoleMappingFile), null);
286+
var loader = TextLoader.ReadFile(env, new RepositoryStreamWrapper(rep, DirTrainingInfo, RoleMappingFile), new TextLoader.Arguments());
287287

288288
using (var cursor = loader.GetRowCursor(c => true))
289289
{

test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ public void SetupBreastCancerPipeline()
104104
new TextLoader.Column("Label", DataKind.BL, 0),
105105
new TextLoader.Column("Features", DataKind.R4, new[] { new TextLoader.Range(1, 9) })
106106
},
107-
false
107+
hasHeader: false
108108
);
109109

110110
IDataView data = reader.Read(_breastCancerDataPath);

test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -64,26 +64,29 @@ public void TrainSentiment()
6464
{
6565
var env = new MLContext(seed: 1);
6666
// Pipeline
67-
var loader = TextLoader.ReadFile(env, new MultiFileSource(_sentimentDataPath),
68-
columns: new[]
67+
var arguemnts = new TextLoader.Arguments()
68+
{
69+
Column = new TextLoader.Column[]
6970
{
7071
new TextLoader.Column()
7172
{
7273
Name = "Label",
73-
Source = new [] { new TextLoader.Range() { Min=0, Max=0} },
74+
Source = new[] { new TextLoader.Range() { Min = 0, Max = 0 } },
7475
Type = DataKind.Num
7576
},
7677

7778
new TextLoader.Column()
7879
{
7980
Name = "SentimentText",
80-
Source = new [] { new TextLoader.Range() { Min=1, Max=1} },
81+
Source = new[] { new TextLoader.Range() { Min = 1, Max = 1 } },
8182
Type = DataKind.Text
8283
}
8384
},
84-
hasHeader: true,
85-
advancedSettings: s => { s.AllowQuoting = false; s.AllowSparse = false; }
86-
);
85+
HasHeader = true,
86+
AllowQuoting = false,
87+
AllowSparse = false
88+
};
89+
var loader = TextLoader.ReadFile(env, new MultiFileSource(_sentimentDataPath), arguemnts);
8790

8891
var text = TextFeaturizingEstimator.Create(env,
8992
new TextFeaturizingEstimator.Arguments()

test/Microsoft.ML.Predictor.Tests/TestPredictors.cs

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -606,7 +606,7 @@ public void RankingLightGBMTest()
606606
public void TestTreeEnsembleCombiner()
607607
{
608608
var dataPath = GetDataPath("breast-cancer.txt");
609-
var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath), null);
609+
var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath), new TextLoader.Arguments());
610610

611611
var fastTrees = new IPredictorModel[3];
612612
for (int i = 0; i < 3; i++)
@@ -628,7 +628,7 @@ public void TestTreeEnsembleCombiner()
628628
public void TestTreeEnsembleCombinerWithCategoricalSplits()
629629
{
630630
var dataPath = GetDataPath("adult.tiny.with-schema.txt");
631-
var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath), null);
631+
var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath), new TextLoader.Arguments());
632632

633633
var cat = new OneHotEncodingEstimator(Env, "Categories", "Features").Fit(dataView).Transform(dataView);
634634
var fastTrees = new IPredictorModel[3];
@@ -729,7 +729,7 @@ private void CombineAndTestTreeEnsembles(IDataView idv, IPredictorModel[] fastTr
729729
public void TestEnsembleCombiner()
730730
{
731731
var dataPath = GetDataPath("breast-cancer.txt");
732-
var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath), null);
732+
var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath), new TextLoader.Arguments());
733733

734734
var predictors = new IPredictorModel[]
735735
{
@@ -775,7 +775,7 @@ public void TestEnsembleCombiner()
775775
public void TestMultiClassEnsembleCombiner()
776776
{
777777
var dataPath = GetDataPath("breast-cancer.txt");
778-
var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath), null);
778+
var dataView = TextLoader.ReadFile(Env, new MultiFileSource(dataPath), new TextLoader.Arguments());
779779

780780
var predictors = new IPredictorModel[]
781781
{

test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -438,7 +438,7 @@ protected bool SaveLoadText(IDataView view, IHostEnvironment env,
438438

439439
// Note that we don't pass in "args", but pass in a default args so we test
440440
// the auto-schema parsing.
441-
var loadedData = TextLoader.ReadFile(env, new MultiFileSource(pathData), null);
441+
var loadedData = TextLoader.ReadFile(env, new MultiFileSource(pathData), new TextLoader.Arguments());
442442
if (!CheckMetadataTypes(loadedData.Schema))
443443
Failed();
444444

test/Microsoft.ML.TestFramework/ModelHelper.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ public static IDataView GetKcHouseDataView(string dataPath)
6767
new Runtime.Data.TextLoader.Column("SqftLot15", Runtime.Data.DataKind.R4, 20)
6868
},
6969
hasHeader: true,
70-
separatorChars: new[] { ',' }
70+
separatorChar: ','
7171
);
7272
}
7373

test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,6 @@ private static TextLoader.Column[] MakeIrisColumns()
6464
};
6565
}
6666

67-
private static char[] MakeIrisSeparator()
68-
{
69-
return new[] { ',' };
70-
}
71-
7267
private static TextLoader.Column[] MakeSentimentColumns()
7368
{
7469
return new[]

test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -99,9 +99,9 @@ private void TrainRegression(string trainDataPath, string testDataPath, string m
9999
new TextLoader.Column("Target", DataKind.R4, 11),
100100
},
101101
// First line of the file is a header, not a data row.
102-
true,
102+
hasHeader: true,
103103
// Default separator is tab, but we need a semicolon.
104-
new[] { ';' }
104+
separatorChar: ';'
105105
);
106106

107107
// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
@@ -176,7 +176,7 @@ private ITransformer TrainOnIris(string irisDataPath)
176176
new TextLoader.Column("Label", DataKind.TX, 4),
177177
},
178178
// Default separator is tab, but the dataset has comma.
179-
separatorChars: new[] { ',' }
179+
separatorChar: ','
180180
);
181181

182182
// Retrieve the training data.
@@ -241,7 +241,7 @@ private void NormalizationWorkout(string dataPath)
241241
new TextLoader.Column("Label", DataKind.TX, 4),
242242
},
243243
// Default separator is tab, but the dataset has comma.
244-
separatorChars: new[] { ',' }
244+
separatorChar: ','
245245
);
246246

247247
// Read the training data.
@@ -300,7 +300,7 @@ private void TextFeaturizationOn(string dataPath)
300300
new TextLoader.Column("IsToxic", DataKind.BL, 0),
301301
new TextLoader.Column("Message", DataKind.TX, 1),
302302
},
303-
true
303+
hasHeader: true
304304
);
305305

306306
// Read the data.
@@ -372,7 +372,7 @@ private void CategoricalFeaturizationOn(params string[] dataPath)
372372
// Let's also separately load the 'Workclass' column.
373373
new TextLoader.Column("Workclass", DataKind.TX, 1),
374374
},
375-
true
375+
hasHeader: true
376376
);
377377

378378
// Read the data.
@@ -437,7 +437,7 @@ private void CrossValidationOn(string dataPath)
437437
new TextLoader.Column("Label", DataKind.TX, 4),
438438
},
439439
// Default separator is tab, but the dataset has comma.
440-
separatorChars: new[] { ',' }
440+
separatorChar: ','
441441
);
442442

443443
// Read the data.
@@ -494,7 +494,7 @@ private void ReadDataDynamic(string dataPath)
494494
new TextLoader.Column("Target", DataKind.R4, 10)
495495
},
496496
// Default separator is tab, but we need a comma.
497-
separatorChars: new[] { ',' });
497+
separatorChar: ',' );
498498

499499
// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
500500
var data = reader.Read(dataPath);
@@ -520,7 +520,7 @@ public void CustomTransformer()
520520
{
521521
new TextLoader.Column("Income", DataKind.R4, 10),
522522
new TextLoader.Column("Features", DataKind.R4, 12, 14)
523-
}, true);
523+
}, hasHeader: true);
524524

525525
PrepareData(mlContext, data);
526526
TrainModel(mlContext, data);

test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ void New_CrossValidation()
2727
{
2828
var ml = new MLContext(seed: 1, conc: 1);
2929

30-
var data = ml.Data.TextReader(MakeSentimentColumns(), true).Read(GetDataPath(TestDatasets.Sentiment.trainFilename));
30+
var data = ml.Data.TextReader(MakeSentimentColumns(), hasHeader: true).Read(GetDataPath(TestDatasets.Sentiment.trainFilename));
3131
// Pipeline.
3232
var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features")
3333
.Append(ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features", advancedSettings: (s) => { s.ConvergenceTolerance = 1f; s.NumThreads = 1; }));

0 commit comments

Comments
 (0)