From 8ce91db96316ea6f3cc63f47806553b34c238fb2 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 25 Feb 2019 10:03:21 -0800 Subject: [PATCH 1/4] Make text loaders consistent --- .../DataLoadSave/Text/TextLoader.cs | 14 +++--- .../Text/TextLoaderSaverCatalog.cs | 49 ++++++++++++------- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 23183aab9b..872dd0d2a4 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -1095,17 +1095,19 @@ private bool HasHeader /// Whether the file can contain numerical vectors in sparse format. /// Whether the content of a column can be parsed from a string starting and ending with quote. /// Allows to expose items that can be used for reading. + /// Remove trailing whitespace from lines. internal TextLoader(IHostEnvironment env, Column[] columns, char separatorChar = Defaults.Separator, bool hasHeader = Defaults.HasHeader, bool allowSparse = Defaults.AllowSparse, - bool allowQuoting = Defaults.AllowQuoting, IMultiStreamSource dataSample = null) - : this(env, MakeArgs(columns, hasHeader, new[] { separatorChar }, allowSparse, allowQuoting), dataSample) + bool allowQuoting = Defaults.AllowQuoting, IMultiStreamSource dataSample = null, bool trimWhitespace = Defaults.TrimWhitespace) + : this(env, MakeArgs(columns, hasHeader, new[] { separatorChar }, allowSparse, allowQuoting, trimWhitespace), dataSample) { } - private static Options MakeArgs(Column[] columns, bool hasHeader, char[] separatorChars, bool allowSparse, bool allowQuoting) + private static Options MakeArgs(Column[] columns, bool hasHeader, char[] separatorChars, bool allowSparse, bool allowQuoting, bool trimWhitespace) { Contracts.AssertValue(separatorChars); - var result = new Options { Columns = columns, HasHeader = hasHeader, Separators = separatorChars, AllowSparse = allowSparse, AllowQuoting = allowQuoting }; + var result = new Options { Columns = columns, HasHeader = hasHeader, Separators = separatorChars, + AllowSparse = allowSparse, AllowQuoting = allowQuoting, TrimWhitespace = trimWhitespace }; return result; } @@ -1462,7 +1464,7 @@ void ICanSaveModel.Save(ModelSaveContext ctx) internal static TextLoader CreateTextReader(IHostEnvironment host, bool hasHeader = Defaults.HasHeader, char separator = Defaults.Separator, - bool allowQuotedStrings = Defaults.AllowQuoting, + bool allowQuoting = Defaults.AllowQuoting, bool supportSparse = Defaults.AllowSparse, bool trimWhitespace = Defaults.TrimWhitespace) { @@ -1519,7 +1521,7 @@ internal static TextLoader CreateTextReader(IHostEnvironment host, { HasHeader = hasHeader, Separators = new[] { separator }, - AllowQuoting = allowQuotedStrings, + AllowQuoting = allowQuoting, AllowSparse = supportSparse, TrimWhitespace = trimWhitespace, Columns = columns.ToArray() diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index feb6a8c320..ee52245d1e 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -20,6 +20,7 @@ public static class TextLoaderSaverCatalog /// Whether the file has a header. /// Whether the file can contain numerical vectors in sparse format. /// Whether the file can contain column defined by a quoted string. + /// Remove trailing whitespace from lines /// The optional location of a data sample. The sample can be used to infer column names and number of slots in each column. public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, TextLoader.Column[] columns, @@ -27,8 +28,9 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, bool hasHeader = TextLoader.Defaults.HasHeader, bool allowSparse = TextLoader.Defaults.AllowSparse, bool allowQuoting = TextLoader.Defaults.AllowQuoting, + bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, IMultiStreamSource dataSample = null) - => new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, separatorChar, hasHeader, allowSparse, allowQuoting, dataSample); + => new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, separatorChar, hasHeader, allowSparse, allowQuoting, dataSample, trimWhitespace); /// /// Create a text loader . @@ -47,20 +49,20 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, /// The catalog. /// Column separator character. Default is '\t' /// Does the file contains header? + /// Whether the input may include sparse representations for example, + /// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero + /// except for 3rd and 5th columns which have values 6 and 3 /// Whether the input may include quoted values, /// which can contain separator characters, colons, /// and distinguish empty values from missing values. When true, consecutive separators /// denote a missing value and an empty value is denoted by \"\". /// When false, consecutive separators denote an empty value. - /// Whether the input may include sparse representations for example, - /// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero - /// except for 3rd and 5th columns which have values 6 and 3 /// Remove trailing whitespace from lines public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, char separatorChar = TextLoader.Defaults.Separator, bool hasHeader = TextLoader.Defaults.HasHeader, - bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool allowSparse = TextLoader.Defaults.AllowSparse, + bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool trimWhitespace = TextLoader.Defaults.TrimWhitespace) => TextLoader.CreateTextReader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace); @@ -68,24 +70,31 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog cat /// Read a data view from a text file using . /// /// The catalog. + /// The path to the file. /// The columns of the schema. - /// Whether the file has a header. /// The character used as separator between data points in a row. By default the tab character is used as separator. - /// The path to the file. + /// Whether the file has a header. + /// Whether the file can contain numerical vectors in sparse format. + /// Whether the file can contain column defined by a quoted string. + /// Remove trailing whitespace from lines + /// The optional location of a data sample. The sample can be used to infer column names and number of slots in each column. /// The data view. public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, string path, TextLoader.Column[] columns, char separatorChar = TextLoader.Defaults.Separator, - bool hasHeader = TextLoader.Defaults.HasHeader) + bool hasHeader = TextLoader.Defaults.HasHeader, + bool allowSparse = TextLoader.Defaults.AllowSparse, + bool allowQuoting = TextLoader.Defaults.AllowQuoting, + bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, + IMultiStreamSource dataSample = null) { Contracts.CheckNonEmpty(path, nameof(path)); var env = catalog.GetEnvironment(); - // REVIEW: it is almost always a mistake to have a 'trainable' text loader here. - // Therefore, we are going to disallow data sample. - var reader = new TextLoader(env, columns, separatorChar, hasHeader, dataSample: null); + var reader = new TextLoader(env, columns, separatorChar: separatorChar, hasHeader: hasHeader, allowSparse: allowSparse, + allowQuoting: allowQuoting, dataSample: dataSample, trimWhitespace: trimWhitespace); return reader.Read(new MultiFileSource(path)); } @@ -93,25 +102,25 @@ public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, /// Read a data view from a text file using . /// /// The catalog. - /// Does the file contains header? + /// The path to the file. /// Column separator character. Default is '\t' + /// Does the file contains header? + /// Whether the input may include sparse representations for example, + /// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero + /// except for 3rd and 5th columns which have values 6 and 3 /// Whether the input may include quoted values, /// which can contain separator characters, colons, /// and distinguish empty values from missing values. When true, consecutive separators /// denote a missing value and an empty value is denoted by \"\". /// When false, consecutive separators denote an empty value. - /// Whether the input may include sparse representations for example, - /// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero - /// except for 3rd and 5th columns which have values 6 and 3 /// Remove trailing whitespace from lines - /// The path to the file. /// The data view. public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, string path, char separatorChar = TextLoader.Defaults.Separator, bool hasHeader = TextLoader.Defaults.HasHeader, - bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool allowSparse = TextLoader.Defaults.AllowSparse, + bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool trimWhitespace = TextLoader.Defaults.TrimWhitespace) { Contracts.CheckNonEmpty(path, nameof(path)); @@ -128,14 +137,16 @@ public static IDataView ReadFromTextFile(this DataOperationsCatalog cata /// The catalog. /// Specifies a file from which to read. /// Defines the settings of the load operation. - public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, string path, TextLoader.Options options = null) + /// The optional location of a data sample. The sample can be used to infer column names and number of slots in each column. + public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, string path, + TextLoader.Options options = null, IMultiStreamSource dataSample = null) { Contracts.CheckNonEmpty(path, nameof(path)); var env = catalog.GetEnvironment(); var source = new MultiFileSource(path); - return new TextLoader(env, options, source).Read(source); + return new TextLoader(env, options, dataSample).Read(source); } /// From ebe7ca8cea278fef1663bf0d61864a6aad6cc01c Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 25 Feb 2019 11:41:01 -0800 Subject: [PATCH 2/4] Address comments --- .../DataLoadSave/Text/TextLoader.cs | 5 +- .../Text/TextLoaderSaverCatalog.cs | 47 ++++++++++--------- .../Datasets/MnistOneClass.cs | 6 +-- .../Datasets/TypeTestData.cs | 6 +-- .../Evaluation.cs | 2 +- .../Prediction.cs | 2 +- .../Validation.cs | 4 +- test/Microsoft.ML.Tests/TextLoaderTests.cs | 2 +- 8 files changed, 40 insertions(+), 34 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 872dd0d2a4..b511649705 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -1466,7 +1466,8 @@ internal static TextLoader CreateTextReader(IHostEnvironment host, char separator = Defaults.Separator, bool allowQuoting = Defaults.AllowQuoting, bool supportSparse = Defaults.AllowSparse, - bool trimWhitespace = Defaults.TrimWhitespace) + bool trimWhitespace = Defaults.TrimWhitespace, + IMultiStreamSource dataSample = null) { var userType = typeof(TInput); @@ -1527,7 +1528,7 @@ internal static TextLoader CreateTextReader(IHostEnvironment host, Columns = columns.ToArray() }; - return new TextLoader(host, options); + return new TextLoader(host, options, dataSample: dataSample); } private sealed class BoundLoader : IDataLoader diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index ee52245d1e..0b05a2594c 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -18,18 +18,18 @@ public static class TextLoaderSaverCatalog /// Array of columns defining the schema. /// The character used as separator between data points in a row. By default the tab character is used as separator. /// Whether the file has a header. - /// Whether the file can contain numerical vectors in sparse format. + /// The optional location of a data sample. The sample can be used to infer column names and number of slots in each column. /// Whether the file can contain column defined by a quoted string. /// Remove trailing whitespace from lines - /// The optional location of a data sample. The sample can be used to infer column names and number of slots in each column. + /// Whether the file can contain numerical vectors in sparse format. public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, TextLoader.Column[] columns, char separatorChar = TextLoader.Defaults.Separator, bool hasHeader = TextLoader.Defaults.HasHeader, - bool allowSparse = TextLoader.Defaults.AllowSparse, + IMultiStreamSource dataSample = null, bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, - IMultiStreamSource dataSample = null) + bool allowSparse = TextLoader.Defaults.AllowSparse) => new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, separatorChar, hasHeader, allowSparse, allowQuoting, dataSample, trimWhitespace); /// @@ -49,22 +49,25 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, /// The catalog. /// Column separator character. Default is '\t' /// Does the file contains header? - /// Whether the input may include sparse representations for example, - /// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero - /// except for 3rd and 5th columns which have values 6 and 3 + /// The optional location of a data sample. The sample can be used to infer column names and number of slots in each column. /// Whether the input may include quoted values, /// which can contain separator characters, colons, /// and distinguish empty values from missing values. When true, consecutive separators /// denote a missing value and an empty value is denoted by \"\". /// When false, consecutive separators denote an empty value. /// Remove trailing whitespace from lines + /// Whether the input may include sparse representations for example, + /// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero + /// except for 3rd and 5th columns which have values 6 and 3 public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, char separatorChar = TextLoader.Defaults.Separator, bool hasHeader = TextLoader.Defaults.HasHeader, - bool allowSparse = TextLoader.Defaults.AllowSparse, + IMultiStreamSource dataSample = null, bool allowQuoting = TextLoader.Defaults.AllowQuoting, - bool trimWhitespace = TextLoader.Defaults.TrimWhitespace) - => TextLoader.CreateTextReader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace); + bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, + bool allowSparse = TextLoader.Defaults.AllowSparse) + => TextLoader.CreateTextReader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, + allowSparse, trimWhitespace, dataSample: dataSample); /// /// Read a data view from a text file using . @@ -74,20 +77,20 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog cat /// The columns of the schema. /// The character used as separator between data points in a row. By default the tab character is used as separator. /// Whether the file has a header. - /// Whether the file can contain numerical vectors in sparse format. + /// The optional location of a data sample. The sample can be used to infer column names and number of slots in each column. /// Whether the file can contain column defined by a quoted string. /// Remove trailing whitespace from lines - /// The optional location of a data sample. The sample can be used to infer column names and number of slots in each column. + /// Whether the file can contain numerical vectors in sparse format. /// The data view. public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, string path, TextLoader.Column[] columns, char separatorChar = TextLoader.Defaults.Separator, bool hasHeader = TextLoader.Defaults.HasHeader, - bool allowSparse = TextLoader.Defaults.AllowSparse, + IMultiStreamSource dataSample = null, bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, - IMultiStreamSource dataSample = null) + bool allowSparse = TextLoader.Defaults.AllowSparse) { Contracts.CheckNonEmpty(path, nameof(path)); @@ -105,30 +108,32 @@ public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, /// The path to the file. /// Column separator character. Default is '\t' /// Does the file contains header? - /// Whether the input may include sparse representations for example, - /// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero - /// except for 3rd and 5th columns which have values 6 and 3 + /// The optional location of a data sample. The sample can be used to infer column names and number of slots in each column. /// Whether the input may include quoted values, /// which can contain separator characters, colons, /// and distinguish empty values from missing values. When true, consecutive separators /// denote a missing value and an empty value is denoted by \"\". /// When false, consecutive separators denote an empty value. /// Remove trailing whitespace from lines + /// Whether the input may include sparse representations for example, + /// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero + /// except for 3rd and 5th columns which have values 6 and 3 /// The data view. public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, string path, char separatorChar = TextLoader.Defaults.Separator, bool hasHeader = TextLoader.Defaults.HasHeader, - bool allowSparse = TextLoader.Defaults.AllowSparse, + IMultiStreamSource dataSample = null, bool allowQuoting = TextLoader.Defaults.AllowQuoting, - bool trimWhitespace = TextLoader.Defaults.TrimWhitespace) + bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, + bool allowSparse = TextLoader.Defaults.AllowSparse) { Contracts.CheckNonEmpty(path, nameof(path)); // REVIEW: it is almost always a mistake to have a 'trainable' text loader here. // Therefore, we are going to disallow data sample. - return TextLoader.CreateTextReader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace) - .Read(new MultiFileSource(path)); + return TextLoader.CreateTextReader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, + allowQuoting, allowSparse, trimWhitespace, dataSample: dataSample).Read(new MultiFileSource(path)); } /// diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs b/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs index 0a83091fd8..6329a80b0b 100644 --- a/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs +++ b/test/Microsoft.ML.Functional.Tests/Datasets/MnistOneClass.cs @@ -21,9 +21,9 @@ public static TextLoader GetTextLoader(MLContext mlContext, bool hasHeader, char new TextLoader.Column("Label", DataKind.Single, 0), new TextLoader.Column("Features", DataKind.Single, 1, 1 + _featureLength) }, - separatorChar: separatorChar, - hasHeader: hasHeader, - allowSparse: true); + separatorChar: separatorChar, + hasHeader: hasHeader, + allowSparse: true); } } } diff --git a/test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs b/test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs index 223320cb0a..6b87a3947f 100644 --- a/test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs +++ b/test/Microsoft.ML.Functional.Tests/Datasets/TypeTestData.cs @@ -96,9 +96,9 @@ public static TextLoader GetTextLoader(MLContext mlContext, char separator) new TextLoader.Column("Dz", DataKind.DateTimeOffset, 14), new TextLoader.Column("Features", DataKind.Single, 15, 15 + _numFeatures - 1), }, - separatorChar: separator, - hasHeader: true, - allowQuoting: true); + separatorChar: separator, + hasHeader: true, + allowQuoting: true); } /// diff --git a/test/Microsoft.ML.Functional.Tests/Evaluation.cs b/test/Microsoft.ML.Functional.Tests/Evaluation.cs index 6ffec01b32..1bedbe1dc4 100644 --- a/test/Microsoft.ML.Functional.Tests/Evaluation.cs +++ b/test/Microsoft.ML.Functional.Tests/Evaluation.cs @@ -240,7 +240,7 @@ public void TrainAndEvaluateRegression() // Get the dataset. var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), - hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) + hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) .Read(GetDataPath(TestDatasets.housing.trainFilename)); // Create a pipeline to train on the sentiment data. diff --git a/test/Microsoft.ML.Functional.Tests/Prediction.cs b/test/Microsoft.ML.Functional.Tests/Prediction.cs index 74ec111c92..41740157ea 100644 --- a/test/Microsoft.ML.Functional.Tests/Prediction.cs +++ b/test/Microsoft.ML.Functional.Tests/Prediction.cs @@ -23,7 +23,7 @@ public void ReconfigurablePrediction() // Get the dataset, create a train and test var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), - hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) + hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) .Read(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename)); var split = mlContext.BinaryClassification.TrainTestSplit(data, testFraction: 0.2); diff --git a/test/Microsoft.ML.Functional.Tests/Validation.cs b/test/Microsoft.ML.Functional.Tests/Validation.cs index ed1cccbf7c..09cb8e9e34 100644 --- a/test/Microsoft.ML.Functional.Tests/Validation.cs +++ b/test/Microsoft.ML.Functional.Tests/Validation.cs @@ -28,7 +28,7 @@ void CrossValidation() // Get the dataset. var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), - hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) + hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) .Read(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename)); // Create a pipeline to train on the sentiment data. @@ -62,7 +62,7 @@ public void TrainWithValidationSet() // Get the dataset. var data = mlContext.Data.CreateTextLoader(TestDatasets.housing.GetLoaderColumns(), - hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) + hasHeader: TestDatasets.housing.fileHasHeader, separatorChar: TestDatasets.housing.fileSeparator) .Read(BaseTestClass.GetDataPath(TestDatasets.housing.trainFilename)); var dataSplit = mlContext.Regression.TrainTestSplit(data, testFraction: 0.2); var trainData = dataSplit.TrainSet; diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index b8cb8afadd..0adbb29eab 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -149,7 +149,7 @@ public void ConstructorDoesntThrow() Assert.NotNull(mlContext.Data.ReadFromTextFile("fakeFile.txt")); Assert.NotNull(mlContext.Data.ReadFromTextFile("fakeFile.txt", hasHeader: true)); Assert.NotNull(mlContext.Data.ReadFromTextFile("fakeFile.txt", hasHeader: false)); - Assert.NotNull(mlContext.Data.ReadFromTextFile("fakeFile.txt", hasHeader: false, allowSparse: false, trimWhitespace: false)); + Assert.NotNull(mlContext.Data.ReadFromTextFile("fakeFile.txt", hasHeader: false, trimWhitespace: false, allowSparse: false)); Assert.NotNull(mlContext.Data.ReadFromTextFile("fakeFile.txt", hasHeader: false, allowSparse: false)); Assert.NotNull(mlContext.Data.ReadFromTextFile("fakeFile.txt", hasHeader: false, allowQuoting: false)); Assert.NotNull(mlContext.Data.ReadFromTextFile("fakeFile.txt")); From 96d561e3cc631c25db13f84393b5c07f01d8e090 Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 25 Feb 2019 13:51:16 -0800 Subject: [PATCH 3/4] Remove a redundant ctor of TextLoader --- .../DataLoadSave/Text/TextLoader.cs | 26 -------- .../Text/TextLoaderSaverCatalog.cs | 27 ++++++-- .../ValueToKeyMappingTransformer.cs | 17 +++-- .../Text/StopWordsRemovingTransformer.cs | 14 ++-- .../PredictionEngineBench.cs | 64 +++++++++++-------- ...sticDualCoordinateAscentClassifierBench.cs | 50 ++++++++------- .../TrainerEstimators/MetalinearEstimators.cs | 11 +++- 7 files changed, 122 insertions(+), 87 deletions(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index b511649705..55fc8a99db 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -1085,32 +1085,6 @@ private bool HasHeader private readonly IHost _host; private const string RegistrationName = "TextLoader"; - /// - /// Loads a text file into an . Supports basic mapping from input columns to IDataView columns. - /// - /// The environment to use. - /// Defines a mapping between input columns in the file and IDataView columns. - /// The character used as separator between data points in a row. By default the tab character is used as separator. - /// Whether the file has a header. - /// Whether the file can contain numerical vectors in sparse format. - /// Whether the content of a column can be parsed from a string starting and ending with quote. - /// Allows to expose items that can be used for reading. - /// Remove trailing whitespace from lines. - internal TextLoader(IHostEnvironment env, Column[] columns, char separatorChar = Defaults.Separator, - bool hasHeader = Defaults.HasHeader, bool allowSparse = Defaults.AllowSparse, - bool allowQuoting = Defaults.AllowQuoting, IMultiStreamSource dataSample = null, bool trimWhitespace = Defaults.TrimWhitespace) - : this(env, MakeArgs(columns, hasHeader, new[] { separatorChar }, allowSparse, allowQuoting, trimWhitespace), dataSample) - { - } - - private static Options MakeArgs(Column[] columns, bool hasHeader, char[] separatorChars, bool allowSparse, bool allowQuoting, bool trimWhitespace) - { - Contracts.AssertValue(separatorChars); - var result = new Options { Columns = columns, HasHeader = hasHeader, Separators = separatorChars, - AllowSparse = allowSparse, AllowQuoting = allowQuoting, TrimWhitespace = trimWhitespace }; - return result; - } - /// /// Loads a text file into an . Supports basic mapping from input columns to IDataView columns. /// diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index 0b05a2594c..47b0d36061 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -30,7 +30,19 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, bool allowQuoting = TextLoader.Defaults.AllowQuoting, bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, bool allowSparse = TextLoader.Defaults.AllowSparse) - => new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, separatorChar, hasHeader, allowSparse, allowQuoting, dataSample, trimWhitespace); + { + var options = new TextLoader.Options + { + Columns = columns, + Separators = new[] { separatorChar }, + HasHeader = hasHeader, + AllowQuoting = allowQuoting, + TrimWhitespace = trimWhitespace, + AllowSparse = allowSparse + }; + + return new TextLoader(CatalogUtils.GetEnvironment(catalog), options: options, dataSample: dataSample); + } /// /// Create a text loader . @@ -94,10 +106,17 @@ public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, { Contracts.CheckNonEmpty(path, nameof(path)); - var env = catalog.GetEnvironment(); + var options = new TextLoader.Options + { + Columns = columns, + Separators = new[] { separatorChar }, + HasHeader = hasHeader, + AllowQuoting = allowQuoting, + TrimWhitespace = trimWhitespace, + AllowSparse = allowSparse + }; - var reader = new TextLoader(env, columns, separatorChar: separatorChar, hasHeader: hasHeader, allowSparse: allowSparse, - allowQuoting: allowQuoting, dataSample: dataSample, trimWhitespace: trimWhitespace); + var reader = new TextLoader(CatalogUtils.GetEnvironment(catalog), options: options, dataSample: dataSample); return reader.Read(new MultiFileSource(path)); } diff --git a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs index aaba6cdebf..b3c22a057f 100644 --- a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs +++ b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs @@ -439,10 +439,19 @@ internal static IDataView GetKeyDataViewOrNull(IHostEnvironment env, IChannel ch "{0} should not be specified when default loader is " + nameof(TextLoader) + ". Ignoring {0}={1}", nameof(Options.TermsColumn), src); } - keyData = new TextLoader(env, - columns: new[] { new TextLoader.Column("Term", DataKind.String, 0) }, - dataSample: fileSource) - .Read(fileSource); + + // Create text loader. + var options = new TextLoader.Options() + { + Columns = new[] + { + new TextLoader.Column("Term", DataKind.String, 0) + } + }; + var reader = new TextLoader(env, options: options, dataSample: fileSource); + + keyData = reader.Read(fileSource); + src = "Term"; // In this case they are relying on heuristics, so auto-loading in this case is most appropriate. autoConvert = true; diff --git a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs index 9c5015722f..a14c455dce 100644 --- a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs @@ -737,13 +737,19 @@ private IDataLoader GetLoaderForStopwords(IChannel ch, string dataFile, { if (stopwordsCol == null) stopwordsCol = "Stopwords"; - dataLoader = new TextLoader( - Host, - columns: new[] + + // Create text loader. + var options = new TextLoader.Options() + { + Columns = new[] { new TextLoader.Column(stopwordsCol, DataKind.String, 0) }, - dataSample: fileSource).Read(fileSource) as IDataLoader; + Separators = new[] { ',' }, + }; + var reader = new TextLoader(Host, options: options, dataSample: fileSource); + + dataLoader = reader.Read(fileSource) as IDataLoader; } ch.AssertNonEmpty(stopwordsCol); } diff --git a/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs b/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs index be3dc58b40..97fce498c9 100644 --- a/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs +++ b/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs @@ -39,17 +39,21 @@ public void SetupIrisPipeline() string _irisDataPath = BaseTestClass.GetDataPath("iris.txt"); var env = new MLContext(seed: 1, conc: 1); - var reader = new TextLoader(env, - columns: new[] - { - new TextLoader.Column("Label", DataKind.Single, 0), - new TextLoader.Column("SepalLength", DataKind.Single, 1), - new TextLoader.Column("SepalWidth", DataKind.Single, 2), - new TextLoader.Column("PetalLength", DataKind.Single, 3), - new TextLoader.Column("PetalWidth", DataKind.Single, 4), - }, - hasHeader: true - ); + + // Create text loader. + var options = new TextLoader.Options() + { + Columns = new[] + { + new TextLoader.Column("Label", DataKind.Single, 0), + new TextLoader.Column("SepalLength", DataKind.Single, 1), + new TextLoader.Column("SepalWidth", DataKind.Single, 2), + new TextLoader.Column("PetalLength", DataKind.Single, 3), + new TextLoader.Column("PetalWidth", DataKind.Single, 4), + }, + HasHeader = true, + }; + var reader = new TextLoader(env, options: options); IDataView data = reader.Read(_irisDataPath); @@ -73,13 +77,18 @@ public void SetupSentimentPipeline() string _sentimentDataPath = BaseTestClass.GetDataPath("wikipedia-detox-250-line-data.tsv"); var mlContext = new MLContext(seed: 1, conc: 1); - var reader = new TextLoader(mlContext, columns: new[] - { - new TextLoader.Column("Label", DataKind.Boolean, 0), - new TextLoader.Column("SentimentText", DataKind.String, 1) - }, - hasHeader: true - ); + + // Create text loader. + var options = new TextLoader.Options() + { + Columns = new[] + { + new TextLoader.Column("Label", DataKind.Boolean, 0), + new TextLoader.Column("SentimentText", DataKind.String, 1) + }, + HasHeader = true, + }; + var reader = new TextLoader(mlContext, options: options); IDataView data = reader.Read(_sentimentDataPath); @@ -103,13 +112,18 @@ public void SetupBreastCancerPipeline() string _breastCancerDataPath = BaseTestClass.GetDataPath("breast-cancer.txt"); var env = new MLContext(seed: 1, conc: 1); - var reader = new TextLoader(env, columns: new[] - { - new TextLoader.Column("Label", DataKind.Boolean, 0), - new TextLoader.Column("Features", DataKind.Single, new[] { new TextLoader.Range(1, 9) }) - }, - hasHeader: false - ); + + // Create text loader. + var options = new TextLoader.Options() + { + Columns = new[] + { + new TextLoader.Column("Label", DataKind.Boolean, 0), + new TextLoader.Column("Features", DataKind.Single, new[] { new TextLoader.Range(1, 9) }) + }, + HasHeader = false, + }; + var reader = new TextLoader(env, options: options); IDataView data = reader.Read(_breastCancerDataPath); diff --git a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs index 00571ad3f9..f771f13b14 100644 --- a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs +++ b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs @@ -53,17 +53,20 @@ protected override IEnumerable GetMetrics() private TransformerChain> Train(string dataPath) { - var reader = new TextLoader(mlContext, - columns: new[] - { - new TextLoader.Column("Label", DataKind.Single, 0), - new TextLoader.Column("SepalLength", DataKind.Single, 1), - new TextLoader.Column("SepalWidth", DataKind.Single, 2), - new TextLoader.Column("PetalLength", DataKind.Single, 3), - new TextLoader.Column("PetalWidth", DataKind.Single, 4), - }, - hasHeader: true - ); + // Create text loader. + var options = new TextLoader.Options() + { + Columns = new[] + { + new TextLoader.Column("Label", DataKind.Single, 0), + new TextLoader.Column("SepalLength", DataKind.Single, 1), + new TextLoader.Column("SepalWidth", DataKind.Single, 2), + new TextLoader.Column("PetalLength", DataKind.Single, 3), + new TextLoader.Column("PetalWidth", DataKind.Single, 4), + }, + HasHeader = true, + }; + var reader = new TextLoader(mlContext, options: options); IDataView data = reader.Read(dataPath); @@ -116,17 +119,20 @@ public void SetupPredictBenchmarks() _predictionEngine = _trainedModel.CreatePredictionEngine(mlContext); _consumer.Consume(_predictionEngine.Predict(_example)); - var reader = new TextLoader(mlContext, - columns: new[] - { - new TextLoader.Column("Label", DataKind.Single, 0), - new TextLoader.Column("SepalLength", DataKind.Single, 1), - new TextLoader.Column("SepalWidth", DataKind.Single, 2), - new TextLoader.Column("PetalLength", DataKind.Single, 3), - new TextLoader.Column("PetalWidth", DataKind.Single, 4), - }, - hasHeader: true - ); + // Create text loader. + var options = new TextLoader.Options() + { + Columns = new[] + { + new TextLoader.Column("Label", DataKind.Single, 0), + new TextLoader.Column("SepalLength", DataKind.Single, 1), + new TextLoader.Column("SepalWidth", DataKind.Single, 2), + new TextLoader.Column("PetalLength", DataKind.Single, 3), + new TextLoader.Column("PetalWidth", DataKind.Single, 4), + }, + HasHeader = true, + }; + var reader = new TextLoader(mlContext, options: options); IDataView testData = reader.Read(_dataPath); IDataView scoredTestData = _trainedModel.Transform(testData); diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs b/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs index bd08dc37a6..0f23807334 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs @@ -74,8 +74,15 @@ public void Pkpd() [Fact] public void MetacomponentsFeaturesRenamed() { - var data = new TextLoader(Env, TestDatasets.irisData.GetLoaderColumns(), separatorChar: ',') - .Read(GetDataPath(TestDatasets.irisData.trainFilename)); + // Create text loader. + var options = new TextLoader.Options() + { + Columns = TestDatasets.irisData.GetLoaderColumns(), + Separators = new[] { ',' }, + }; + var reader = new TextLoader(Env, options: options); + + var data = reader.Read(GetDataPath(TestDatasets.irisData.trainFilename)); var sdcaTrainer = ML.BinaryClassification.Trainers.StochasticDualCoordinateAscentNonCalibrated( new SdcaNonCalibratedBinaryTrainer.Options { From ead2583955179ad702eca998fe10ce5989ca3d1b Mon Sep 17 00:00:00 2001 From: Wei-Sheng Chin Date: Mon, 25 Feb 2019 14:54:05 -0800 Subject: [PATCH 4/4] Fix tests --- .../DataLoadSave/Text/TextLoaderSaverCatalog.cs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index 47b0d36061..d6cd3a2633 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -170,7 +170,10 @@ public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, str var env = catalog.GetEnvironment(); var source = new MultiFileSource(path); - return new TextLoader(env, options, dataSample).Read(source); + if (dataSample == null) + return new TextLoader(env, options, source).Read(source); + else + return new TextLoader(env, options, dataSample).Read(source); } ///