diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 7ff7b1f8eb..87e8b82647 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -95,7 +95,7 @@ This is how you can read this data: var mlContext = new MLContext(); // Create the reader: define the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(ctx => ( +var reader = mlContext.Data.CreateTextReader(ctx => ( // A boolean column depicting the 'target label'. IsOver50K: ctx.LoadBool(0), // Three text columns. @@ -115,9 +115,7 @@ If the schema of the data is not known at compile time, or too cumbersome, you c var mlContext = new MLContext(); // Create the reader: define the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(new TextLoader.Arguments -{ - Column = new[] { +var reader = mlContext.Data.CreateTextReader(new[] { // A boolean column depicting the 'label'. new TextLoader.Column("IsOver50K", DataKind.BL, 0), // Three text columns. @@ -126,8 +124,8 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments new TextLoader.Column("MaritalStatus", DataKind.TX, 3) }, // First line of the file is a header, not a data row. - HasHeader = true -}); + hasHeader: true +); // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). var data = reader.Read(dataPath); @@ -155,7 +153,7 @@ This is how you can read this data: var mlContext = new MLContext(); // Create the reader: define the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(ctx => ( +var reader = mlContext.Data.CreateTextReader(ctx => ( // A boolean column depicting the 'target label'. IsOver50K: ctx.LoadBool(14), // Three text columns. @@ -175,19 +173,17 @@ The code is very similar using the dynamic API: var mlContext = new MLContext(); // Create the reader: define the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(new TextLoader.Arguments -{ - Column = new[] { +var reader = mlContext.Data.CreateTextReader(new[] { // A boolean column depicting the 'label'. - new TextLoader.Column("IsOver50k", DataKind.BL, 0), + new TextLoader.Column("IsOver50K", DataKind.BL, 0), // Three text columns. new TextLoader.Column("Workclass", DataKind.TX, 1), new TextLoader.Column("Education", DataKind.TX, 2), new TextLoader.Column("MaritalStatus", DataKind.TX, 3) }, // First line of the file is a header, not a data row. - HasHeader = true -}); + hasHeader: true +); var data = reader.Read(exampleFile1, exampleFile2); ``` @@ -211,7 +207,7 @@ Reading this file using `TextLoader`: var mlContext = new MLContext(); // Create the reader: define the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(ctx => ( +var reader = mlContext.Data.CreateTextReader(ctx => ( // We read the first 11 values as a single float vector. FeatureVector: ctx.LoadFloat(0, 10), // Separately, read the target variable. @@ -233,7 +229,7 @@ If the schema of the data is not known at compile time, or too cumbersome, you c var mlContext = new MLContext(); // Create the reader: define the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(new[] { +var reader = mlContext.Data.CreateTextReader(new[] { // We read the first 10 values as a single float vector. new TextLoader.Column("FeatureVector", DataKind.R4, new[] {new TextLoader.Range(0, 9)}), // Separately, read the target variable. @@ -302,7 +298,7 @@ Label Workclass education marital-status var mlContext = new MLContext(); // Create the reader: define the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(ctx => ( +var reader = mlContext.Data.CreateTextReader(ctx => ( // A boolean column depicting the 'target label'. IsOver50K: ctx.LoadBool(0), // Three text columns. @@ -365,19 +361,17 @@ You can also use the dynamic API to create the equivalent of the previous pipeli var mlContext = new MLContext(); // Create the reader: define the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(new TextLoader.Arguments -{ - Column = new[] { +var reader = mlContext.Data.CreateTextReader(new[] { // A boolean column depicting the 'label'. - new TextLoader.Column("IsOver50k", DataKind.BL, 0), + new TextLoader.Column("IsOver50K", DataKind.BL, 0), // Three text columns. new TextLoader.Column("Workclass", DataKind.TX, 1), new TextLoader.Column("Education", DataKind.TX, 2), new TextLoader.Column("MaritalStatus", DataKind.TX, 3) }, // First line of the file is a header, not a data row. - HasHeader = true -}); + hasHeader: true +); // Start creating our processing pipeline. For now, let's just concatenate all the text columns // together into one. @@ -428,7 +422,7 @@ var mlContext = new MLContext(); // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(ctx => ( +var reader = mlContext.Data.CreateTextReader(ctx => ( // We read the first 11 values as a single float vector. FeatureVector: ctx.LoadFloat(0, 10), // Separately, read the target variable. @@ -482,9 +476,7 @@ var mlContext = new MLContext(); // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(new TextLoader.Arguments -{ - Column = new[] { +var reader = mlContext.Data.CreateTextReader(new[] { // We read the first 11 values as a single float vector. new TextLoader.Column("FeatureVector", DataKind.R4, 0, 10), @@ -492,10 +484,10 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments new TextLoader.Column("Target", DataKind.R4, 11), }, // First line of the file is a header, not a data row. - HasHeader = true, + hasHeader: true, // Default separator is tab, but we need a semicolon. - Separator = ";" -}); + separatorChar: ';' +); // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). var trainData = reader.Read(trainDataPath); @@ -603,7 +595,7 @@ var mlContext = new MLContext(); // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(ctx => ( +var reader = mlContext.Data.CreateTextReader(ctx => ( // The four features of the Iris dataset. SepalLength: ctx.LoadFloat(0), SepalWidth: ctx.LoadFloat(1), @@ -653,9 +645,7 @@ var mlContext = new MLContext(); // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(new TextLoader.Arguments -{ - Column = new[] { +var reader = mlContext.Data.CreateTextReader(new[] { new TextLoader.Column("SepalLength", DataKind.R4, 0), new TextLoader.Column("SepalWidth", DataKind.R4, 1), new TextLoader.Column("PetalLength", DataKind.R4, 2), @@ -664,8 +654,8 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments new TextLoader.Column("Label", DataKind.TX, 4), }, // Default separator is tab, but the dataset has comma. - Separator = "," -}); + separatorChar: ',' +); // Retrieve the training data. var trainData = reader.Read(irisDataPath); @@ -821,7 +811,7 @@ var mlContext = new MLContext(); // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(ctx => ( +var reader = mlContext.Data.CreateTextReader(ctx => ( // The four features of the Iris dataset. SepalLength: ctx.LoadFloat(0), SepalWidth: ctx.LoadFloat(1), @@ -917,7 +907,7 @@ Here's a snippet of code that demonstrates normalization in learning pipelines. var mlContext = new MLContext(); // Define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(ctx => ( +var reader = mlContext.Data.CreateTextReader(ctx => ( // The four features of the Iris dataset will be grouped together as one Features column. Features: ctx.LoadFloat(0, 3), // Label: kind of iris. @@ -952,17 +942,15 @@ You can achieve the same results using the dynamic API. var mlContext = new MLContext(); // Define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(new TextLoader.Arguments -{ - Column = new[] { +var reader = mlContext.Data.CreateTextReader(new[] { // The four features of the Iris dataset will be grouped together as one Features column. new TextLoader.Column("Features", DataKind.R4, 0, 3), // Label: kind of iris. new TextLoader.Column("Label", DataKind.TX, 4), }, // Default separator is tab, but the dataset has comma. - Separator = "," -}); + separatorChar: ',' +); // Read the training data. var trainData = reader.Read(dataPath); @@ -1011,7 +999,7 @@ Label Workclass education marital-status occupation relationship ethnicity sex n var mlContext = new MLContext(); // Define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(ctx => ( +var reader = mlContext.Data.CreateTextReader(ctx => ( Label: ctx.LoadBool(0), // We will load all the categorical features into one vector column of size 8. CategoricalFeatures: ctx.LoadText(1, 8), @@ -1073,9 +1061,8 @@ You can achieve the same results using the dynamic API. var mlContext = new MLContext(); // Define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(new TextLoader.Arguments -{ - Column = new[] { +var reader = mlContext.Data.CreateTextReader(new[] + { new TextLoader.Column("Label", DataKind.BL, 0), // We will load all the categorical features into one vector column of size 8. new TextLoader.Column("CategoricalFeatures", DataKind.TX, 1, 8), @@ -1084,8 +1071,8 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments // Let's also separately load the 'Workclass' column. new TextLoader.Column("Workclass", DataKind.TX, 1), }, - HasHeader = true -}); + hasHeader: true +); // Read the data. var data = reader.Read(dataPath); @@ -1157,7 +1144,7 @@ Sentiment SentimentText var mlContext = new MLContext(); // Define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(ctx => ( +var reader = mlContext.Data.CreateTextReader(ctx => ( IsToxic: ctx.LoadBool(0), Message: ctx.LoadText(1) ), hasHeader: true); @@ -1207,14 +1194,13 @@ You can achieve the same results using the dynamic API. var mlContext = new MLContext(); // Define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(new TextLoader.Arguments -{ - Column = new[] { +var reader = mlContext.Data.CreateTextReader(new[] + { new TextLoader.Column("IsToxic", DataKind.BL, 0), new TextLoader.Column("Message", DataKind.TX, 1), }, - HasHeader = true -}); + hasHeader: true +); // Read the data. var data = reader.Read(dataPath); @@ -1274,7 +1260,7 @@ var mlContext = new MLContext(); // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(ctx => ( +var reader = mlContext.Data.CreateTextReader(ctx => ( // The four features of the Iris dataset. SepalLength: ctx.LoadFloat(0), SepalWidth: ctx.LoadFloat(1), @@ -1330,9 +1316,8 @@ var mlContext = new MLContext(); // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(new TextLoader.Arguments -{ - Column = new[] { +var reader = mlContext.Data.CreateTextReader(new[] + { // We read the first 11 values as a single float vector. new TextLoader.Column("SepalLength", DataKind.R4, 0), new TextLoader.Column("SepalWidth", DataKind.R4, 1), @@ -1342,8 +1327,8 @@ var reader = mlContext.Data.TextReader(new TextLoader.Arguments new TextLoader.Column("Label", DataKind.TX, 4), }, // Default separator is tab, but the dataset has comma. - Separator = "," -}); + separatorChar: ',' +); // Read the data. var data = reader.Read(dataPath); @@ -1395,7 +1380,7 @@ var mlContext = new MLContext(); // Read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.TextReader(ctx => ( +var reader = mlContext.Data.CreateTextReader(ctx => ( // The four features of the Iris dataset. SepalLength: ctx.LoadFloat(0), SepalWidth: ctx.LoadFloat(1), diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureContributionCalculationTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureContributionCalculationTransform.cs index 1c28df327f..d88cff2cdd 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureContributionCalculationTransform.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureContributionCalculationTransform.cs @@ -19,11 +19,8 @@ public static void FeatureContributionCalculationTransform_Regression() // Step 1: Read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(new TextLoader.Arguments() - { - Separator = "tab", - HasHeader = true, - Column = new[] + var reader = mlContext.Data.CreateTextReader( + columns: new[] { new TextLoader.Column("MedianHomeValue", DataKind.R4, 0), new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1), @@ -37,8 +34,9 @@ public static void FeatureContributionCalculationTransform_Regression() new TextLoader.Column("HighwayDistance", DataKind.R4, 9), new TextLoader.Column("TaxRate", DataKind.R4, 10), new TextLoader.Column("TeacherRatio", DataKind.R4, 11), - } - }); + }, + hasHeader: true + ); // Read the data var data = reader.Read(dataFile); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs index 7508815dc4..f0d0442d42 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/FeatureSelectionTransform.cs @@ -31,16 +31,14 @@ public static void FeatureSelectionTransform() // First, we define the reader: specify the data columns and where to find them in the text file. Notice that we combine entries from // all the feature columns into entries of a vector of a single column named "Features". - var reader = ml.Data.TextReader(new TextLoader.Arguments() - { - Separator = "tab", - HasHeader = true, - Column = new[] + var reader = ml.Data.CreateTextReader( + columns: new[] { new TextLoader.Column("Label", DataKind.BL, 0), new TextLoader.Column("Features", DataKind.Num, new [] { new TextLoader.Range(1, 9) }) - } - }); + }, + hasHeader: true + ); // Then, we use the reader to read the data as an IDataView. var data = reader.Read(dataFilePath); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/GeneralizedAdditiveModels.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/GeneralizedAdditiveModels.cs index 827d04a586..26fadc0148 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/GeneralizedAdditiveModels.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/GeneralizedAdditiveModels.cs @@ -19,11 +19,8 @@ public static void RunExample() // Step 1: Read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(new TextLoader.Arguments() - { - Separator = "tab", - HasHeader = true, - Column = new[] + var reader = mlContext.Data.CreateTextReader( + columns: new[] { new TextLoader.Column("MedianHomeValue", DataKind.R4, 0), new TextLoader.Column("CrimesPerCapita", DataKind.R4, 1), @@ -37,8 +34,9 @@ public static void RunExample() new TextLoader.Column("HighwayDistance", DataKind.R4, 9), new TextLoader.Column("TaxRate", DataKind.R4, 10), new TextLoader.Column("TeacherRatio", DataKind.R4, 11), - } - }); + }, + hasHeader: true + ); // Read the data var data = reader.Read(dataFile); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs index 0c95abacb8..7150f12835 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/PermutationFeatureImportance.cs @@ -22,11 +22,8 @@ public static void PFI_Regression() // First, we define the reader: specify the data columns and where to find them in the text file. // The data file is composed of rows of data, with each row having 11 numerical columns // separated by whitespace. - var reader = mlContext.Data.TextReader(new TextLoader.Arguments() - { - Separator = "tab", - HasHeader = true, - Column = new[] + var reader = mlContext.Data.CreateTextReader( + columns: new[] { // Read the first column (indexed by 0) in the data file as an R4 (float) new TextLoader.Column("MedianHomeValue", DataKind.R4, 0), @@ -40,9 +37,10 @@ public static void PFI_Regression() new TextLoader.Column("EmploymentDistance", DataKind.R4, 8), new TextLoader.Column("HighwayDistance", DataKind.R4, 9), new TextLoader.Column("TaxRate", DataKind.R4, 10), - new TextLoader.Column("TeacherRatio", DataKind.R4, 11), - } - }); + new TextLoader.Column("TeacherRatio", DataKind.R4, 11) + }, + hasHeader: true + ); // Read the data var data = reader.Read(dataFile); diff --git a/docs/samples/Microsoft.ML.Samples/Dynamic/SDCA.cs b/docs/samples/Microsoft.ML.Samples/Dynamic/SDCA.cs index a6c1904f6a..09dea18ff1 100644 --- a/docs/samples/Microsoft.ML.Samples/Dynamic/SDCA.cs +++ b/docs/samples/Microsoft.ML.Samples/Dynamic/SDCA.cs @@ -24,16 +24,14 @@ public static void SDCA_BinaryClassification() // Step 1: Read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(new TextLoader.Arguments() - { - Separator = "tab", - HasHeader = true, - Column = new[] + var reader = mlContext.Data.CreateTextReader( + columns: new[] { new TextLoader.Column("Sentiment", DataKind.BL, 0), new TextLoader.Column("SentimentText", DataKind.Text, 1) - } - }); + }, + hasHeader: true + ); // Read the data var data = reader.Read(dataFile); diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index e748967714..a3d5260531 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -25,7 +25,6 @@ namespace Microsoft.ML.Runtime.Data { /// /// Loads a text file into an IDataView. Supports basic mapping from input columns to IDataView columns. - /// Should accept any file that TlcTextInstances accepts. /// public sealed partial class TextLoader : IDataReader, ICanSaveModel { @@ -1008,23 +1007,38 @@ private bool HasHeader private readonly IHost _host; private const string RegistrationName = "TextLoader"; - public TextLoader(IHostEnvironment env, Column[] columns, Action advancedSettings, IMultiStreamSource dataSample = null) - : this(env, MakeArgs(columns, advancedSettings), dataSample) + /// + /// Loads a text file into an . Supports basic mapping from input columns to IDataView columns. + /// + /// The environment to use. + /// Defines a mapping between input columns in the file and IDataView columns. + /// Whether the file has a header. + /// The character used as separator between data points in a row. By default the tab character is used as separator. + /// Allows to expose items that can be used for reading. + public TextLoader(IHostEnvironment env, Column[] columns, bool hasHeader = false, char separatorChar = '\t', IMultiStreamSource dataSample = null) + : this(env, MakeArgs(columns, hasHeader, new[] { separatorChar }), dataSample) { } - private static Arguments MakeArgs(Column[] columns, Action advancedSettings) + private static Arguments MakeArgs(Column[] columns, bool hasHeader, char[] separatorChars) { - var result = new Arguments { Column = columns }; - advancedSettings?.Invoke(result); + Contracts.AssertValue(separatorChars); + var result = new Arguments { Column = columns, HasHeader = hasHeader, SeparatorChars = separatorChars}; return result; } - public TextLoader(IHostEnvironment env, Arguments args, IMultiStreamSource dataSample = null) + /// + /// Loads a text file into an . Supports basic mapping from input columns to IDataView columns. + /// + /// The environment to use. + /// Defines the settings of the load operation. + /// Allows to expose items that can be used for reading. + public TextLoader(IHostEnvironment env, Arguments args = null, IMultiStreamSource dataSample = null) { + args = args ?? new Arguments(); + Contracts.CheckValue(env, nameof(env)); _host = env.Register(RegistrationName); - _host.CheckValue(args, nameof(args)); _host.CheckValueOrNull(dataSample); @@ -1285,7 +1299,7 @@ private TextLoader(IHost host, ModelLoadContext ctx) _parser = new Parser(this); } - public static TextLoader Create(IHostEnvironment env, ModelLoadContext ctx) + internal static TextLoader Create(IHostEnvironment env, ModelLoadContext ctx) { Contracts.CheckValue(env, nameof(env)); IHost h = env.Register(RegistrationName); @@ -1297,15 +1311,15 @@ public static TextLoader Create(IHostEnvironment env, ModelLoadContext ctx) } // These are legacy constructors needed for ComponentCatalog. - public static IDataLoader Create(IHostEnvironment env, ModelLoadContext ctx, IMultiStreamSource files) + internal static IDataLoader Create(IHostEnvironment env, ModelLoadContext ctx, IMultiStreamSource files) => (IDataLoader)Create(env, ctx).Read(files); - public static IDataLoader Create(IHostEnvironment env, Arguments args, IMultiStreamSource files) + internal static IDataLoader Create(IHostEnvironment env, Arguments args, IMultiStreamSource files) => (IDataLoader)new TextLoader(env, args, files).Read(files); /// /// Convenience method to create a and use it to read a specified file. /// - public static IDataView ReadFile(IHostEnvironment env, Arguments args, IMultiStreamSource fileSource) + internal static IDataView ReadFile(IHostEnvironment env, Arguments args, IMultiStreamSource fileSource) => new TextLoader(env, args, fileSource).Read(fileSource); public void Save(ModelSaveContext ctx) diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index e5de3573ee..b4cf936a38 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -5,13 +5,8 @@ using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Data.IO; -using Microsoft.ML.Runtime.Internal.Utilities; -using Microsoft.ML.StaticPipe; using System; -using System.Collections.Generic; using System.IO; -using System.Linq; -using System.Text; using static Microsoft.ML.Runtime.Data.TextLoader; namespace Microsoft.ML @@ -19,36 +14,37 @@ namespace Microsoft.ML public static class TextLoaderSaverCatalog { /// - /// Create a text reader. + /// Create a text reader . /// /// The catalog. - /// The arguments to text reader, describing the data schema. + /// The columns of the schema. + /// Whether the file has a header. + /// The character used as separator between data points in a row. By default the tab character is used as separator. /// The optional location of a data sample. - public static TextLoader TextReader(this DataOperations catalog, - TextLoader.Arguments args, IMultiStreamSource dataSample = null) - => new TextLoader(CatalogUtils.GetEnvironment(catalog), args, dataSample); + public static TextLoader CreateTextReader(this DataOperations catalog, + Column[] columns, bool hasHeader = false, char separatorChar = '\t', IMultiStreamSource dataSample = null) + => new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, hasHeader, separatorChar, dataSample); /// - /// Create a text reader. + /// Create a text reader . /// /// The catalog. - /// The columns of the schema. - /// The delegate to set additional settings. - /// The optional location of a data sample. - public static TextLoader TextReader(this DataOperations catalog, - TextLoader.Column[] columns, Action advancedSettings = null, IMultiStreamSource dataSample = null) - => new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, advancedSettings, dataSample); + /// Defines the settings of the load operation. + /// Allows to expose items that can be used for reading. + public static TextLoader CreateTextReader(this DataOperations catalog, Arguments args, IMultiStreamSource dataSample = null) + => new TextLoader(CatalogUtils.GetEnvironment(catalog), args, dataSample); /// /// Read a data view from a text file using . /// /// The catalog. /// The columns of the schema. - /// The delegate to set additional settings - /// The path to the file + /// Whether the file has a header. + /// The character used as separator between data points in a row. By default the tab character is used as separator. + /// The path to the file. /// The data view. public static IDataView ReadFromTextFile(this DataOperations catalog, - TextLoader.Column[] columns, string path, Action advancedSettings = null) + string path, Column[] columns, bool hasHeader = false, char separatorChar = '\t') { Contracts.CheckNonEmpty(path, nameof(path)); @@ -56,10 +52,26 @@ public static IDataView ReadFromTextFile(this DataOperations catalog, // REVIEW: it is almost always a mistake to have a 'trainable' text loader here. // Therefore, we are going to disallow data sample. - var reader = new TextLoader(env, columns, advancedSettings, dataSample: null); + var reader = new TextLoader(env, columns, hasHeader, separatorChar, dataSample: null); return reader.Read(new MultiFileSource(path)); } + /// + /// Read a data view from a text file using . + /// + /// The catalog. + /// Specifies a file from which to read. + /// Defines the settings of the load operation. + public static IDataView ReadFromTextFile(this DataOperations catalog, string path, Arguments args = null) + { + Contracts.CheckNonEmpty(path, nameof(path)); + + var env = catalog.GetEnvironment(); + var source = new MultiFileSource(path); + + return new TextLoader(env, args, source).Read(source); + } + /// /// Save the data view as text. /// diff --git a/src/Microsoft.ML.Data/StaticPipe/DataLoadSaveOperationsExtensions.cs b/src/Microsoft.ML.Data/StaticPipe/DataLoadSaveOperationsExtensions.cs index 57adb1be4d..a5f0172935 100644 --- a/src/Microsoft.ML.Data/StaticPipe/DataLoadSaveOperationsExtensions.cs +++ b/src/Microsoft.ML.Data/StaticPipe/DataLoadSaveOperationsExtensions.cs @@ -4,14 +4,7 @@ using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; -using Microsoft.ML.Runtime.Data.IO; -using Microsoft.ML.Runtime.Internal.Utilities; -using Microsoft.ML.StaticPipe; using System; -using System.Collections.Generic; -using System.IO; -using System.Linq; -using System.Text; using static Microsoft.ML.Runtime.Data.TextLoader; namespace Microsoft.ML.StaticPipe @@ -40,10 +33,10 @@ public static class DataLoadSaveOperationsExtensions /// Whether the input may include sparse representations. /// Remove trailing whitespace from lines. /// A configured statically-typed reader for text files. - public static DataReader TextReader<[IsShape] TShape>( + public static DataReader CreateTextReader<[IsShape] TShape>( this DataOperations catalog, Func func, IMultiStreamSource files = null, bool hasHeader = false, char separator = '\t', bool allowQuoting = true, bool allowSparse = true, bool trimWhitspace = false) - => TextLoader.CreateReader(catalog.Environment, func, files, hasHeader, separator, allowQuoting, allowSparse, trimWhitspace); + => CreateReader(catalog.Environment, func, files, hasHeader, separator, allowQuoting, allowSparse, trimWhitspace); } } diff --git a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs index 81d8e927b4..b33fe123fe 100644 --- a/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs +++ b/src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs @@ -483,13 +483,10 @@ private static TermMap CreateFileTermMap(IHostEnvironment env, IChannel ch, stri "{0} should not be specified when default loader is TextLoader. Ignoring {0}={1}", nameof(Arguments.TermsColumn), src); } - termData = TextLoader.ReadFile(env, - new TextLoader.Arguments() - { - Separator = "tab", - Column = new[] { new TextLoader.Column("Term", DataKind.TX, 0) } - }, - fileSource); + termData = new TextLoader(env, + columns: new[] { new TextLoader.Column("Term", DataKind.TX, 0) }, + dataSample: fileSource) + .Read(fileSource); src = "Term"; autoConvert = true; } diff --git a/src/Microsoft.ML.Data/Utilities/ModelFileUtils.cs b/src/Microsoft.ML.Data/Utilities/ModelFileUtils.cs index 97948b764b..5debae70b7 100644 --- a/src/Microsoft.ML.Data/Utilities/ModelFileUtils.cs +++ b/src/Microsoft.ML.Data/Utilities/ModelFileUtils.cs @@ -283,8 +283,8 @@ public static IEnumerable> LoadRoleMappingsOrNu { // REVIEW: Should really validate the schema here, and consider // ignoring this stream if it isn't as expected. - var loader = TextLoader.ReadFile(env, new TextLoader.Arguments(), - new RepositoryStreamWrapper(rep, DirTrainingInfo, RoleMappingFile)); + var repoStreamWrapper = new RepositoryStreamWrapper(rep, DirTrainingInfo, RoleMappingFile); + var loader = new TextLoader(env, dataSample: repoStreamWrapper).Read(repoStreamWrapper); using (var cursor = loader.GetRowCursor(c => true)) { diff --git a/src/Microsoft.ML.Transforms/TermLookupTransformer.cs b/src/Microsoft.ML.Transforms/TermLookupTransformer.cs index 83a36c55af..741b112645 100644 --- a/src/Microsoft.ML.Transforms/TermLookupTransformer.cs +++ b/src/Microsoft.ML.Transforms/TermLookupTransformer.cs @@ -349,27 +349,27 @@ private static IComponentFactory GetLoaderFacto // If the user specified non-key values, we define the value column to be numeric. if (!keyValues) return ComponentFactoryUtils.CreateFromFunction( - (env, files) => TextLoader.Create( - env, - new TextLoader.Arguments() - { - Column = new[] + (env, files) => new TextLoader( + env, new[] { new TextLoader.Column("Term", DataKind.TX, 0), new TextLoader.Column("Value", DataKind.Num, 1) - } - }, - files)); + }, dataSample: files).Read(files) as IDataLoader); // If the user specified key values, we scan the values to determine the range of the key type. ulong min = ulong.MaxValue; ulong max = ulong.MinValue; try { - var txtArgs = new TextLoader.Arguments(); - bool parsed = CmdParser.ParseArguments(host, "col=Term:TX:0 col=Value:TX:1", txtArgs); - host.Assert(parsed); - var data = TextLoader.ReadFile(host, txtArgs, new MultiFileSource(filename)); + var file = new MultiFileSource(filename); + var data = new TextLoader(host, new[] + { + new TextLoader.Column("Term", DataKind.TX, 0), + new TextLoader.Column("Value", DataKind.TX, 1) + }, + dataSample: file + ).Read(file); + using (var cursor = data.GetRowCursor(c => true)) { var getTerm = cursor.GetGetter>(0); @@ -444,17 +444,14 @@ private static IComponentFactory GetLoaderFacto } return ComponentFactoryUtils.CreateFromFunction( - (env, files) => TextLoader.Create( - env, - new TextLoader.Arguments() - { - Column = new[] - { - new TextLoader.Column("Term", DataKind.TX, 0), - valueColumn - } - }, - files)); + (env, files) => new TextLoader( + env, + columns: new[] + { + new TextLoader.Column("Term", DataKind.TX, 0), + valueColumn + }, + dataSample: files).Read(files) as IDataLoader); } // This saves the lookup data as a byte array encoded as a binary .idv file. diff --git a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs index a4e0cee7e6..7cd22beb74 100644 --- a/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs +++ b/src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs @@ -722,17 +722,13 @@ private IDataLoader GetLoaderForStopwords(IChannel ch, string dataFile, { if (stopwordsCol == null) stopwordsCol = "Stopwords"; - dataLoader = TextLoader.Create( + dataLoader = new TextLoader( Host, - new TextLoader.Arguments() + columns: new[] { - Separator = "tab", - Column = new[] - { - new TextLoader.Column(stopwordsCol, DataKind.TX, 0) - } + new TextLoader.Column(stopwordsCol, DataKind.TX, 0) }, - fileSource); + dataSample: fileSource).Read(fileSource) as IDataLoader; } ch.AssertNonEmpty(stopwordsCol); } diff --git a/test/Microsoft.ML.Benchmarks/KMeansAndLogisticRegressionBench.cs b/test/Microsoft.ML.Benchmarks/KMeansAndLogisticRegressionBench.cs index 990915128e..1adf7d4065 100644 --- a/test/Microsoft.ML.Benchmarks/KMeansAndLogisticRegressionBench.cs +++ b/test/Microsoft.ML.Benchmarks/KMeansAndLogisticRegressionBench.cs @@ -18,7 +18,7 @@ public ParameterMixingCalibratedPredictor TrainKMeansAndLR() var ml = new MLContext(seed: 1); // Pipeline - var input = ml.Data.ReadFromTextFile(new[] { + var input = ml.Data.ReadFromTextFile(_dataPath, new[] { new TextLoader.Column("Label", DataKind.R4, 0), new TextLoader.Column("CatFeatures", DataKind.TX, new [] { @@ -28,11 +28,7 @@ public ParameterMixingCalibratedPredictor TrainKMeansAndLR() new [] { new TextLoader.Range() { Min = 9, Max = 14 }, }), - }, _dataPath, s => - { - s.HasHeader = true; - s.Separator = "\t"; - }); + }, hasHeader: true); var estimatorPipeline = ml.Transforms.Categorical.OneHotEncoding("CatFeatures") .Append(ml.Transforms.Normalize("NumFeatures")) diff --git a/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs b/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs index 39342cf224..c711ab63f6 100644 --- a/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs +++ b/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs @@ -38,19 +38,16 @@ public void SetupIrisPipeline() var env = new MLContext(seed: 1, conc: 1); var reader = new TextLoader(env, - new TextLoader.Arguments() - { - Separator = "\t", - HasHeader = true, - Column = new[] + columns: new[] { new TextLoader.Column("Label", DataKind.R4, 0), new TextLoader.Column("SepalLength", DataKind.R4, 1), new TextLoader.Column("SepalWidth", DataKind.R4, 2), new TextLoader.Column("PetalLength", DataKind.R4, 3), new TextLoader.Column("PetalWidth", DataKind.R4, 4), - } - }); + }, + hasHeader: true + ); IDataView data = reader.Read(_irisDataPath); @@ -73,17 +70,13 @@ public void SetupSentimentPipeline() string _sentimentDataPath = Program.GetInvariantCultureDataPath("wikipedia-detox-250-line-data.tsv"); var env = new MLContext(seed: 1, conc: 1); - var reader = new TextLoader(env, - new TextLoader.Arguments() - { - Separator = "\t", - HasHeader = true, - Column = new[] + var reader = new TextLoader(env, columns: new[] { new TextLoader.Column("Label", DataKind.BL, 0), new TextLoader.Column("SentimentText", DataKind.Text, 1) - } - }); + }, + hasHeader: true + ); IDataView data = reader.Read(_sentimentDataPath); @@ -106,17 +99,13 @@ public void SetupBreastCancerPipeline() string _breastCancerDataPath = Program.GetInvariantCultureDataPath("breast-cancer.txt"); var env = new MLContext(seed: 1, conc: 1); - var reader = new TextLoader(env, - new TextLoader.Arguments() - { - Separator = "\t", - HasHeader = false, - Column = new[] + var reader = new TextLoader(env, columns: new[] { new TextLoader.Column("Label", DataKind.BL, 0), new TextLoader.Column("Features", DataKind.R4, new[] { new TextLoader.Range(1, 9) }) - } - }); + }, + hasHeader: false + ); IDataView data = reader.Read(_breastCancerDataPath); diff --git a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs index c2fee89f24..bee16e0d7b 100644 --- a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs +++ b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs @@ -64,30 +64,29 @@ public void TrainSentiment() { var env = new MLContext(seed: 1); // Pipeline - var loader = TextLoader.ReadFile(env, - new TextLoader.Arguments() + var arguments = new TextLoader.Arguments() + { + Column = new TextLoader.Column[] { - AllowQuoting = false, - AllowSparse = false, - Separator = "tab", - HasHeader = true, - Column = new[] + new TextLoader.Column() { - new TextLoader.Column() - { - Name = "Label", - Source = new [] { new TextLoader.Range() { Min=0, Max=0} }, - Type = DataKind.Num - }, + Name = "Label", + Source = new[] { new TextLoader.Range() { Min = 0, Max = 0 } }, + Type = DataKind.Num + }, - new TextLoader.Column() - { - Name = "SentimentText", - Source = new [] { new TextLoader.Range() { Min=1, Max=1} }, - Type = DataKind.Text - } + new TextLoader.Column() + { + Name = "SentimentText", + Source = new[] { new TextLoader.Range() { Min = 1, Max = 1 } }, + Type = DataKind.Text } - }, new MultiFileSource(_sentimentDataPath)); + }, + HasHeader = true, + AllowQuoting = false, + AllowSparse = false + }; + var loader = env.Data.ReadFromTextFile(_sentimentDataPath, arguments); var text = TextFeaturizingEstimator.Create(env, new TextFeaturizingEstimator.Arguments() diff --git a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs index f96f6f205a..94c7d98155 100644 --- a/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs +++ b/test/Microsoft.ML.Predictor.Tests/TestPredictors.cs @@ -606,12 +606,12 @@ public void RankingLightGBMTest() public void TestTreeEnsembleCombiner() { var dataPath = GetDataPath("breast-cancer.txt"); - var dataView = TextLoader.Create(Env, new TextLoader.Arguments(), new MultiFileSource(dataPath)); + var dataView = ML.Data.ReadFromTextFile(dataPath); var fastTrees = new IPredictorModel[3]; for (int i = 0; i < 3; i++) { - fastTrees[i] = FastTree.TrainBinary(Env, new FastTreeBinaryClassificationTrainer.Arguments + fastTrees[i] = FastTree.TrainBinary(ML, new FastTreeBinaryClassificationTrainer.Arguments { FeatureColumn = "Features", NumTrees = 5, @@ -628,13 +628,13 @@ public void TestTreeEnsembleCombiner() public void TestTreeEnsembleCombinerWithCategoricalSplits() { var dataPath = GetDataPath("adult.tiny.with-schema.txt"); - var dataView = TextLoader.Create(Env, new TextLoader.Arguments(), new MultiFileSource(dataPath)); + var dataView = ML.Data.ReadFromTextFile(dataPath); - var cat = new OneHotEncodingEstimator(Env, "Categories", "Features").Fit(dataView).Transform(dataView); + var cat = new OneHotEncodingEstimator(ML, "Categories", "Features").Fit(dataView).Transform(dataView); var fastTrees = new IPredictorModel[3]; for (int i = 0; i < 3; i++) { - fastTrees[i] = FastTree.TrainBinary(Env, new FastTreeBinaryClassificationTrainer.Arguments + fastTrees[i] = FastTree.TrainBinary(ML, new FastTreeBinaryClassificationTrainer.Arguments { FeatureColumn = "Features", NumTrees = 5, @@ -729,11 +729,11 @@ private void CombineAndTestTreeEnsembles(IDataView idv, IPredictorModel[] fastTr public void TestEnsembleCombiner() { var dataPath = GetDataPath("breast-cancer.txt"); - var dataView = TextLoader.Create(Env, new TextLoader.Arguments(), new MultiFileSource(dataPath)); + var dataView = ML.Data.ReadFromTextFile(dataPath); var predictors = new IPredictorModel[] { - FastTree.TrainBinary(Env, new FastTreeBinaryClassificationTrainer.Arguments + FastTree.TrainBinary(ML, new FastTreeBinaryClassificationTrainer.Arguments { FeatureColumn = "Features", NumTrees = 5, @@ -741,7 +741,7 @@ public void TestEnsembleCombiner() LabelColumn = DefaultColumnNames.Label, TrainingData = dataView }).PredictorModel, - AveragedPerceptronTrainer.TrainBinary(Env, new AveragedPerceptronTrainer.Arguments() + AveragedPerceptronTrainer.TrainBinary(ML, new AveragedPerceptronTrainer.Arguments() { FeatureColumn = "Features", LabelColumn = DefaultColumnNames.Label, @@ -749,7 +749,7 @@ public void TestEnsembleCombiner() TrainingData = dataView, NormalizeFeatures = NormalizeOption.No }).PredictorModel, - LogisticRegression.TrainBinary(Env, new LogisticRegression.Arguments() + LogisticRegression.TrainBinary(ML, new LogisticRegression.Arguments() { FeatureColumn = "Features", LabelColumn = DefaultColumnNames.Label, @@ -757,7 +757,7 @@ public void TestEnsembleCombiner() TrainingData = dataView, NormalizeFeatures = NormalizeOption.No }).PredictorModel, - LogisticRegression.TrainBinary(Env, new LogisticRegression.Arguments() + LogisticRegression.TrainBinary(ML, new LogisticRegression.Arguments() { FeatureColumn = "Features", LabelColumn = DefaultColumnNames.Label, @@ -775,7 +775,7 @@ public void TestEnsembleCombiner() public void TestMultiClassEnsembleCombiner() { var dataPath = GetDataPath("breast-cancer.txt"); - var dataView = TextLoader.Create(Env, new TextLoader.Arguments(), new MultiFileSource(dataPath)); + var dataView = ML.Data.ReadFromTextFile(dataPath); var predictors = new IPredictorModel[] { diff --git a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs index 35da6afbc0..0cff6c1794 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/StaticPipeTests.cs @@ -882,7 +882,7 @@ public void TestConvertStatic() + "1 1 2 4 15"; var dataSource = new BytesStreamSource(content); - var text = ml.Data.TextReader(ctx => ( + var text = ml.Data.CreateTextReader(ctx => ( label: ctx.LoadBool(0), text: ctx.LoadText(1), numericFeatures: ctx.LoadDouble(2, null)), // If fit correctly, this ought to be equivalent to max of 4, that is, length of 3. diff --git a/test/Microsoft.ML.StaticPipelineTesting/Training.cs b/test/Microsoft.ML.StaticPipelineTesting/Training.cs index 959f316111..ee5e2626cf 100644 --- a/test/Microsoft.ML.StaticPipelineTesting/Training.cs +++ b/test/Microsoft.ML.StaticPipelineTesting/Training.cs @@ -976,7 +976,7 @@ public void MatrixFactorization() // Read data file. The file contains 3 columns, label (float value), matrixColumnIndex (unsigned integer key), and matrixRowIndex (unsigned integer key). // More specifically, LoadKey(1, 0, 19) means that the matrixColumnIndex column is read from the 2nd (indexed by 1) column in the data file and as // a key type (stored as 32-bit unsigned integer) ranged from 0 to 19 (aka the training matrix has 20 columns). - var reader = mlContext.Data.TextReader(ctx => (label: ctx.LoadFloat(0), matrixColumnIndex: ctx.LoadKey(1, 0, 19), matrixRowIndex: ctx.LoadKey(2, 0, 39)), hasHeader: true); + var reader = mlContext.Data.CreateTextReader(ctx => (label: ctx.LoadFloat(0), matrixColumnIndex: ctx.LoadKey(1, 0, 19), matrixRowIndex: ctx.LoadKey(2, 0, 39)), hasHeader: true); // The parameter that will be into the onFit method below. The obtained predictor will be assigned to this variable // so that we will be able to touch it. diff --git a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs index 16ceae22d3..9ff3432ca6 100644 --- a/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs +++ b/test/Microsoft.ML.TestFramework/DataPipe/TestDataPipeBase.cs @@ -438,7 +438,7 @@ protected bool SaveLoadText(IDataView view, IHostEnvironment env, // Note that we don't pass in "args", but pass in a default args so we test // the auto-schema parsing. - var loadedData = TextLoader.ReadFile(env, new TextLoader.Arguments(), new MultiFileSource(pathData)); + var loadedData = ML.Data.ReadFromTextFile(pathData); if (!CheckMetadataTypes(loadedData.Schema)) Failed(); diff --git a/test/Microsoft.ML.TestFramework/ModelHelper.cs b/test/Microsoft.ML.TestFramework/ModelHelper.cs index 316a9ea755..4692942e83 100644 --- a/test/Microsoft.ML.TestFramework/ModelHelper.cs +++ b/test/Microsoft.ML.TestFramework/ModelHelper.cs @@ -14,7 +14,7 @@ namespace Microsoft.ML.TestFramework #pragma warning disable 612, 618 public static class ModelHelper { - private static IHostEnvironment s_environment = new MLContext(seed: 1); + private static MLContext s_environment = new MLContext(seed: 1); private static ITransformModel s_housePriceModel; public static void WriteKcHousePriceModel(string dataPath, string outputModelPath) @@ -41,17 +41,34 @@ public static void WriteKcHousePriceModel(string dataPath, Stream stream) public static IDataView GetKcHouseDataView(string dataPath) { - var dataSchema = "col=Id:TX:0 col=Date:TX:1 col=Label:R4:2 col=Bedrooms:R4:3 " + - "col=Bathrooms:R4:4 col=SqftLiving:R4:5 col=SqftLot:R4:6 col=Floors:R4:7 " + - "col=Waterfront:R4:8 col=View:R4:9 col=Condition:R4:10 col=Grade:R4:11 " + - "col=SqftAbove:R4:12 col=SqftBasement:R4:13 col=YearBuilt:R4:14 " + - "col=YearRenovated:R4:15 col=Zipcode:R4:16 col=Lat:R4:17 col=Long:R4:18 " + - "col=SqftLiving15:R4:19 col=SqftLot15:R4:20 header+ sep=,"; - - var txtArgs = new Runtime.Data.TextLoader.Arguments(); - bool parsed = CmdParser.ParseArguments(s_environment, dataSchema, txtArgs); - s_environment.Assert(parsed); - return Runtime.Data.TextLoader.ReadFile(s_environment, txtArgs, new MultiFileSource(dataPath)); + return s_environment.Data.ReadFromTextFile(dataPath, + columns: new[] + { + new Runtime.Data.TextLoader.Column("Id", Runtime.Data.DataKind.TX, 0), + new Runtime.Data.TextLoader.Column("Date", Runtime.Data.DataKind.TX, 1), + new Runtime.Data.TextLoader.Column("Label", Runtime.Data.DataKind.R4, 2), + new Runtime.Data.TextLoader.Column("BedRooms", Runtime.Data.DataKind.R4, 3), + new Runtime.Data.TextLoader.Column("BathRooms", Runtime.Data.DataKind.R4, 4), + new Runtime.Data.TextLoader.Column("SqftLiving", Runtime.Data.DataKind.R4, 5), + new Runtime.Data.TextLoader.Column("SqftLot", Runtime.Data.DataKind.R4, 6), + new Runtime.Data.TextLoader.Column("Floors", Runtime.Data.DataKind.R4, 7), + new Runtime.Data.TextLoader.Column("WaterFront", Runtime.Data.DataKind.R4, 8), + new Runtime.Data.TextLoader.Column("View", Runtime.Data.DataKind.R4, 9), + new Runtime.Data.TextLoader.Column("Condition", Runtime.Data.DataKind.R4, 10), + new Runtime.Data.TextLoader.Column("Grade", Runtime.Data.DataKind.R4, 11), + new Runtime.Data.TextLoader.Column("SqftAbove", Runtime.Data.DataKind.R4, 12), + new Runtime.Data.TextLoader.Column("SqftBasement", Runtime.Data.DataKind.R4, 13), + new Runtime.Data.TextLoader.Column("YearBuilt", Runtime.Data.DataKind.R4, 14), + new Runtime.Data.TextLoader.Column("YearRenovated", Runtime.Data.DataKind.R4, 15), + new Runtime.Data.TextLoader.Column("Zipcode", Runtime.Data.DataKind.R4, 16), + new Runtime.Data.TextLoader.Column("Lat", Runtime.Data.DataKind.R4, 17), + new Runtime.Data.TextLoader.Column("Long", Runtime.Data.DataKind.R4, 18), + new Runtime.Data.TextLoader.Column("SqftLiving15", Runtime.Data.DataKind.R4, 19), + new Runtime.Data.TextLoader.Column("SqftLot15", Runtime.Data.DataKind.R4, 20) + }, + hasHeader: true, + separatorChar: ',' + ); } private static ITransformModel CreateKcHousePricePredictorModel(string dataPath) diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs b/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs index 3b9db811e5..6027be2805 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs @@ -52,34 +52,25 @@ public class SentimentPrediction public float Score; } - private static TextLoader.Arguments MakeIrisTextLoaderArgs() + private static TextLoader.Column[] MakeIrisColumns() { - return new TextLoader.Arguments() - { - Separator = "comma", - Column = new[] + return new[] { new TextLoader.Column("SepalLength", DataKind.R4, 0), new TextLoader.Column("SepalWidth", DataKind.R4, 1), new TextLoader.Column("PetalLength", DataKind.R4, 2), new TextLoader.Column("PetalWidth",DataKind.R4, 3), new TextLoader.Column("Label", DataKind.Text, 4) - } - }; + }; } - private static TextLoader.Arguments MakeSentimentTextLoaderArgs() + private static TextLoader.Column[] MakeSentimentColumns() { - return new TextLoader.Arguments() - { - Separator = "tab", - HasHeader = true, - Column = new[] + return new[] { new TextLoader.Column("Label", DataKind.BL, 0), new TextLoader.Column("SentimentText", DataKind.Text, 1) - } - }; + }; } } } diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs index 104855c2a9..66ba6ba137 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamples.cs @@ -43,7 +43,7 @@ private void IntermediateData(string dataPath) var mlContext = new MLContext(); // Create the reader: define the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(ctx => ( + var reader = mlContext.Data.CreateTextReader(ctx => ( // A boolean column depicting the 'target label'. IsOver50K: ctx.LoadBool(0), // Three text columns. @@ -99,7 +99,7 @@ private void TrainRegression(string trainDataPath, string testDataPath, string m // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(ctx => ( + var reader = mlContext.Data.CreateTextReader(ctx => ( // We read the first 11 values as a single float vector. FeatureVector: ctx.LoadFloat(0, 10), // Separately, read the target variable. @@ -178,7 +178,7 @@ private ITransformer TrainOnIris(string irisDataPath) // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(ctx => ( + var reader = mlContext.Data.CreateTextReader(ctx => ( // The four features of the Iris dataset. SepalLength: ctx.LoadFloat(0), SepalWidth: ctx.LoadFloat(1), @@ -256,7 +256,7 @@ private void TrainAndInspectWeights(string dataPath) // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(ctx => ( + var reader = mlContext.Data.CreateTextReader(ctx => ( // The four features of the Iris dataset. SepalLength: ctx.LoadFloat(0), SepalWidth: ctx.LoadFloat(1), @@ -328,7 +328,7 @@ private void NormalizationWorkout(string dataPath) var mlContext = new MLContext(); // Define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(ctx => ( + var reader = mlContext.Data.CreateTextReader(ctx => ( // The four features of the Iris dataset will be grouped together as one Features column. Features: ctx.LoadFloat(0, 3), // Label: kind of iris. @@ -444,7 +444,7 @@ private void TextFeaturizationOn(string dataPath) var mlContext = new MLContext(); // Define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(ctx => ( + var reader = mlContext.Data.CreateTextReader(ctx => ( IsToxic: ctx.LoadBool(0), Message: ctx.LoadText(1) ), hasHeader: true); @@ -506,7 +506,7 @@ private void CategoricalFeaturizationOn(params string[] dataPath) var mlContext = new MLContext(); // Define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(ctx => ( + var reader = mlContext.Data.CreateTextReader(ctx => ( Label: ctx.LoadBool(0), // We will load all the categorical features into one vector column of size 8. CategoricalFeatures: ctx.LoadText(1, 8), @@ -573,7 +573,7 @@ private void CrossValidationOn(string dataPath) // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(ctx => ( + var reader = mlContext.Data.CreateTextReader(ctx => ( // The four features of the Iris dataset. SepalLength: ctx.LoadFloat(0), SepalWidth: ctx.LoadFloat(1), @@ -633,7 +633,7 @@ private void MixMatch(string dataPath) // Read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(ctx => ( + var reader = mlContext.Data.CreateTextReader(ctx => ( // The four features of the Iris dataset. SepalLength: ctx.LoadFloat(0), SepalWidth: ctx.LoadFloat(1), diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index 1c14a5b158..706b4aad7d 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -41,9 +41,7 @@ private void IntermediateData(string dataPath) var mlContext = new MLContext(); // Create the reader: define the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(new TextLoader.Arguments - { - Column = new[] { + var reader = mlContext.Data.CreateTextReader(new[] { // A boolean column depicting the 'label'. new TextLoader.Column("IsOver50K", DataKind.BL, 0), // Three text columns. @@ -52,8 +50,8 @@ private void IntermediateData(string dataPath) new TextLoader.Column("MaritalStatus", DataKind.TX, 3) }, // First line of the file is a header, not a data row. - HasHeader = true - }); + hasHeader: true + ); // Start creating our processing pipeline. For now, let's just concatenate all the text columns // together into one. @@ -93,9 +91,7 @@ private void TrainRegression(string trainDataPath, string testDataPath, string m // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(new TextLoader.Arguments - { - Column = new[] { + var reader = mlContext.Data.CreateTextReader(new[] { // We read the first 11 values as a single float vector. new TextLoader.Column("FeatureVector", DataKind.R4, 0, 10), @@ -103,10 +99,10 @@ private void TrainRegression(string trainDataPath, string testDataPath, string m new TextLoader.Column("Target", DataKind.R4, 11), }, // First line of the file is a header, not a data row. - HasHeader = true, + hasHeader: true, // Default separator is tab, but we need a semicolon. - Separator = ";" - }); + separatorChar: ';' + ); // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). var trainData = reader.Read(trainDataPath); @@ -171,9 +167,7 @@ private ITransformer TrainOnIris(string irisDataPath) // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(new TextLoader.Arguments - { - Column = new[] { + var reader = mlContext.Data.CreateTextReader(new[] { new TextLoader.Column("SepalLength", DataKind.R4, 0), new TextLoader.Column("SepalWidth", DataKind.R4, 1), new TextLoader.Column("PetalLength", DataKind.R4, 2), @@ -182,8 +176,8 @@ private ITransformer TrainOnIris(string irisDataPath) new TextLoader.Column("Label", DataKind.TX, 4), }, // Default separator is tab, but the dataset has comma. - Separator = "," - }); + separatorChar: ',' + ); // Retrieve the training data. var trainData = reader.Read(irisDataPath); @@ -240,17 +234,15 @@ private void NormalizationWorkout(string dataPath) var mlContext = new MLContext(); // Define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(new TextLoader.Arguments - { - Column = new[] { + var reader = mlContext.Data.CreateTextReader(new[] { // The four features of the Iris dataset will be grouped together as one Features column. new TextLoader.Column("Features", DataKind.R4, 0, 3), // Label: kind of iris. new TextLoader.Column("Label", DataKind.TX, 4), }, // Default separator is tab, but the dataset has comma. - Separator = "," - }); + separatorChar: ',' + ); // Read the training data. var trainData = reader.Read(dataPath); @@ -303,14 +295,13 @@ private void TextFeaturizationOn(string dataPath) var mlContext = new MLContext(); // Define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(new TextLoader.Arguments - { - Column = new[] { + var reader = mlContext.Data.CreateTextReader(new[] + { new TextLoader.Column("IsToxic", DataKind.BL, 0), new TextLoader.Column("Message", DataKind.TX, 1), }, - HasHeader = true - }); + hasHeader: true + ); // Read the data. var data = reader.Read(dataPath); @@ -371,9 +362,8 @@ private void CategoricalFeaturizationOn(params string[] dataPath) var mlContext = new MLContext(); // Define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(new TextLoader.Arguments - { - Column = new[] { + var reader = mlContext.Data.CreateTextReader(new[] + { new TextLoader.Column("Label", DataKind.BL, 0), // We will load all the categorical features into one vector column of size 8. new TextLoader.Column("CategoricalFeatures", DataKind.TX, 1, 8), @@ -382,8 +372,8 @@ private void CategoricalFeaturizationOn(params string[] dataPath) // Let's also separately load the 'Workclass' column. new TextLoader.Column("Workclass", DataKind.TX, 1), }, - HasHeader = true - }); + hasHeader: true + ); // Read the data. var data = reader.Read(dataPath); @@ -436,9 +426,8 @@ private void CrossValidationOn(string dataPath) // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(new TextLoader.Arguments - { - Column = new[] { + var reader = mlContext.Data.CreateTextReader(new[] + { // We read the first 11 values as a single float vector. new TextLoader.Column("SepalLength", DataKind.R4, 0), new TextLoader.Column("SepalWidth", DataKind.R4, 1), @@ -448,8 +437,8 @@ private void CrossValidationOn(string dataPath) new TextLoader.Column("Label", DataKind.TX, 4), }, // Default separator is tab, but the dataset has comma. - Separator = "," - }); + separatorChar: ',' + ); // Read the data. var data = reader.Read(dataPath); @@ -498,14 +487,14 @@ private void ReadDataDynamic(string dataPath) var mlContext = new MLContext(); // Create the reader: define the data columns and where to find them in the text file. - var reader = mlContext.Data.TextReader(new[] { + var reader = mlContext.Data.CreateTextReader(new[] { // We read the first 10 values as a single float vector. new TextLoader.Column("FeatureVector", DataKind.R4, new[] {new TextLoader.Range(0, 9)}), // Separately, read the target variable. new TextLoader.Column("Target", DataKind.R4, 10) }, // Default separator is tab, but we need a comma. - s => s.Separator = ","); + separatorChar: ',' ); // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). var data = reader.Read(dataPath); @@ -527,11 +516,11 @@ public class OutputRow public void CustomTransformer() { var mlContext = new MLContext(); - var data = mlContext.Data.ReadFromTextFile(new[] + var data = mlContext.Data.ReadFromTextFile(GetDataPath("adult.tiny.with-schema.txt"), new[] { new TextLoader.Column("Income", DataKind.R4, 10), new TextLoader.Column("Features", DataKind.R4, 12, 14) - }, GetDataPath("adult.tiny.with-schema.txt"), s => { s.Separator = "\t"; s.HasHeader = true; }); + }, hasHeader: true); PrepareData(mlContext, data); TrainModel(mlContext, data); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs index 1eeaf9c13c..3c5cabb3dc 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs @@ -27,7 +27,7 @@ void New_CrossValidation() { var ml = new MLContext(seed: 1, conc: 1); - var data = ml.Data.TextReader(MakeSentimentTextLoaderArgs()).Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); + var data = ml.Data.CreateTextReader(MakeSentimentColumns(), hasHeader: true).Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); // Pipeline. var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") .Append(ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features", advancedSettings: (s) => { s.ConvergenceTolerance = 1f; s.NumThreads = 1; })); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs index 536a74283f..8694b5b199 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs @@ -30,7 +30,7 @@ void New_DecomposableTrainAndPredict() var dataPath = GetDataPath(TestDatasets.irisData.trainFilename); var ml = new MLContext(); - var data = ml.Data.TextReader(MakeIrisTextLoaderArgs()) + var data = ml.Data.CreateTextReader(MakeIrisColumns(), separatorChar: ',') .Read(dataPath); var pipeline = new ColumnConcatenatingEstimator (ml, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") @@ -41,7 +41,7 @@ void New_DecomposableTrainAndPredict() var model = pipeline.Fit(data).GetModelFor(TransformerScope.Scoring); var engine = model.MakePredictionFunction(ml); - var testLoader = TextLoader.ReadFile(ml, MakeIrisTextLoaderArgs(), new MultiFileSource(dataPath)); + var testLoader = ml.Data.ReadFromTextFile(dataPath, MakeIrisColumns(), separatorChar: ','); var testData = testLoader.AsEnumerable(ml, false); foreach (var input in testData.Take(20)) { diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Evaluation.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Evaluation.cs index ad249fa6e9..b09dbbe82d 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Evaluation.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Evaluation.cs @@ -22,7 +22,7 @@ public void New_Evaluation() var ml = new MLContext(seed: 1, conc: 1); // Pipeline. - var pipeline = ml.Data.TextReader(MakeSentimentTextLoaderArgs()) + var pipeline = ml.Data.CreateTextReader(MakeSentimentColumns(), hasHeader: true) .Append(ml.Transforms.Text.FeaturizeText("SentimentText", "Features")) .Append(ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features", advancedSettings: s => s.NumThreads = 1)); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs index 2c750a7b81..392bf95b85 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Extensibility.cs @@ -29,7 +29,7 @@ void New_Extensibility() var dataPath = GetDataPath(TestDatasets.irisData.trainFilename); var ml = new MLContext(); - var data = ml.Data.TextReader(MakeIrisTextLoaderArgs()) + var data = ml.Data.CreateTextReader(MakeIrisColumns(), separatorChar: ',') .Read(dataPath); Action action = (i, j) => @@ -49,7 +49,7 @@ void New_Extensibility() var model = pipeline.Fit(data).GetModelFor(TransformerScope.Scoring); var engine = model.MakePredictionFunction(ml); - var testLoader = TextLoader.ReadFile(ml, MakeIrisTextLoaderArgs(), new MultiFileSource(dataPath)); + var testLoader = ml.Data.ReadFromTextFile(dataPath, MakeIrisColumns(), separatorChar: ','); var testData = testLoader.AsEnumerable(ml, false); foreach (var input in testData.Take(20)) { diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/FileBasedSavingOfData.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/FileBasedSavingOfData.cs index 667581c9b3..b85f5e646b 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/FileBasedSavingOfData.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/FileBasedSavingOfData.cs @@ -27,7 +27,7 @@ void New_FileBasedSavingOfData() var ml = new MLContext(seed: 1, conc: 1); var src = new MultiFileSource(GetDataPath(TestDatasets.Sentiment.trainFilename)); - var trainData = ml.Data.TextReader(MakeSentimentTextLoaderArgs()) + var trainData = ml.Data.CreateTextReader(MakeSentimentColumns(), hasHeader: true) .Append(ml.Transforms.Text.FeaturizeText("SentimentText", "Features")) .Fit(src).Read(src); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs index daf2777047..6cf2898db4 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs @@ -34,7 +34,7 @@ public partial class ApiScenariosTests public void New_IntrospectiveTraining() { var ml = new MLContext(seed: 1, conc: 1); - var data = ml.Data.TextReader(MakeSentimentTextLoaderArgs()) + var data = ml.Data.CreateTextReader(MakeSentimentColumns(), hasHeader: true) .Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs index 182451a30f..032ef53787 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs @@ -26,7 +26,7 @@ public partial class ApiScenariosTests public void New_Metacomponents() { var ml = new MLContext(); - var data = ml.Data.TextReader(MakeIrisTextLoaderArgs()) + var data = ml.Data.CreateTextReader(MakeIrisColumns(), separatorChar: ',') .Read(GetDataPath(TestDatasets.irisData.trainFilename)); var sdcaTrainer = ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features", advancedSettings: (s) => { s.MaxIterations = 100; s.Shuffle = true; s.NumThreads = 1; }); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/MultithreadedPrediction.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/MultithreadedPrediction.cs index f2285b5200..b5ba493b2d 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/MultithreadedPrediction.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/MultithreadedPrediction.cs @@ -26,7 +26,7 @@ public partial class ApiScenariosTests void New_MultithreadedPrediction() { var ml = new MLContext(seed: 1, conc: 1); - var reader = ml.Data.TextReader(MakeSentimentTextLoaderArgs()); + var reader = ml.Data.CreateTextReader(MakeSentimentColumns(), hasHeader: true); var data = reader.Read(new MultiFileSource(GetDataPath(TestDatasets.Sentiment.trainFilename))); // Pipeline. diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs index ffda5b1ce1..52a9266f3a 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs @@ -22,7 +22,7 @@ public partial class ApiScenariosTests public void New_ReconfigurablePrediction() { var ml = new MLContext(seed: 1, conc: 1); - var dataReader = ml.Data.TextReader(MakeSentimentTextLoaderArgs()); + var dataReader = ml.Data.CreateTextReader(MakeSentimentColumns(), hasHeader: true); var data = dataReader.Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); var testData = dataReader.Read(GetDataPath(TestDatasets.Sentiment.testFilename)); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/SimpleTrainAndPredict.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/SimpleTrainAndPredict.cs index 016acd6220..22ec24c29a 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/SimpleTrainAndPredict.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/SimpleTrainAndPredict.cs @@ -22,7 +22,7 @@ public partial class ApiScenariosTests public void New_SimpleTrainAndPredict() { var ml = new MLContext(seed: 1, conc: 1); - var reader = ml.Data.TextReader(MakeSentimentTextLoaderArgs()); + var reader = ml.Data.CreateTextReader(MakeSentimentColumns(), hasHeader: true); var data = reader.Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); // Pipeline. var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainSaveModelAndPredict.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainSaveModelAndPredict.cs index adab64dec1..59bd307dbd 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainSaveModelAndPredict.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainSaveModelAndPredict.cs @@ -26,7 +26,7 @@ public partial class ApiScenariosTests public void New_TrainSaveModelAndPredict() { var ml = new MLContext(seed: 1, conc: 1); - var reader = ml.Data.TextReader(MakeSentimentTextLoaderArgs()); + var reader = ml.Data.CreateTextReader(MakeSentimentColumns(), hasHeader: true); var data = reader.Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); // Pipeline. diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithInitialPredictor.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithInitialPredictor.cs index a117de429c..6c47365926 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithInitialPredictor.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithInitialPredictor.cs @@ -22,7 +22,7 @@ public void New_TrainWithInitialPredictor() var ml = new MLContext(seed: 1, conc: 1); - var data = ml.Data.TextReader(MakeSentimentTextLoaderArgs()).Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); + var data = ml.Data.CreateTextReader(MakeSentimentColumns(), hasHeader: true).Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); // Pipeline. var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features"); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithValidationSet.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithValidationSet.cs index 2a7030ea94..bda23779c4 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithValidationSet.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithValidationSet.cs @@ -20,7 +20,7 @@ public void New_TrainWithValidationSet() { var ml = new MLContext(seed: 1, conc: 1); // Pipeline. - var reader = ml.Data.TextReader(MakeSentimentTextLoaderArgs()); + var reader = ml.Data.CreateTextReader(MakeSentimentColumns(), hasHeader: true); var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features"); // Train the pipeline, prepare train and validation set. diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Visibility.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Visibility.cs index c91e2498a5..ebab7c3a3b 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Visibility.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Visibility.cs @@ -26,7 +26,7 @@ public partial class ApiScenariosTests void New_Visibility() { var ml = new MLContext(seed: 1, conc: 1); - var pipeline = ml.Data.TextReader(MakeSentimentTextLoaderArgs()) + var pipeline = ml.Data.CreateTextReader(MakeSentimentColumns(), hasHeader: true) .Append(ml.Transforms.Text.FeaturizeText("SentimentText", "Features", s => s.OutputTokens = true)); var src = new MultiFileSource(GetDataPath(TestDatasets.Sentiment.trainFilename)); diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/IrisPlantClassificationTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/IrisPlantClassificationTests.cs index 4752119236..1de3d82ace 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/IrisPlantClassificationTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/IrisPlantClassificationTests.cs @@ -26,10 +26,7 @@ public void TrainAndPredictIrisModelUsingDirectInstantiationTest() { var mlContext = new MLContext(seed: 1, conc: 1); - var reader = mlContext.Data.TextReader(new TextLoader.Arguments() - { - HasHeader = false, - Column = new[] + var reader = mlContext.Data.CreateTextReader(columns: new[] { new TextLoader.Column("Label", DataKind.R4, 0), new TextLoader.Column("SepalLength", DataKind.R4, 1), @@ -37,7 +34,7 @@ public void TrainAndPredictIrisModelUsingDirectInstantiationTest() new TextLoader.Column("PetalLength", DataKind.R4, 3), new TextLoader.Column("PetalWidth", DataKind.R4, 4) } - }); + ); var pipe = mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") .Append(mlContext.Transforms.Normalize("Features")) diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/SentimentPredictionTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/SentimentPredictionTests.cs index b04a360168..ead36a22a5 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/SentimentPredictionTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/SentimentPredictionTests.cs @@ -25,17 +25,14 @@ public void TrainAndPredictSentimentModelWithDirectionInstantiationTest() var env = new MLContext(seed: 1, conc: 1); // Pipeline - var loader = TextLoader.ReadFile(env, - new TextLoader.Arguments() + var loader = env.Data.ReadFromTextFile(dataPath, + columns: new[] { - Separator = "tab", - HasHeader = true, - Column = new[] - { - new TextLoader.Column("Label", DataKind.Num, 0), - new TextLoader.Column("SentimentText", DataKind.Text, 1) - } - }, new MultiFileSource(dataPath)); + new TextLoader.Column("Label", DataKind.Num, 0), + new TextLoader.Column("SentimentText", DataKind.Text, 1) + }, + hasHeader: true + ); var trans = TextFeaturizingEstimator.Create(env, new TextFeaturizingEstimator.Arguments() { @@ -86,17 +83,14 @@ public void TrainAndPredictSentimentModelWithDirectionInstantiationTestWithWordE var env = new MLContext(seed: 1, conc: 1); // Pipeline - var loader = TextLoader.ReadFile(env, - new TextLoader.Arguments() + var loader = env.Data.ReadFromTextFile(dataPath, + columns: new[] { - Separator = "tab", - HasHeader = true, - Column = new[] - { - new TextLoader.Column("Label", DataKind.Num, 0), - new TextLoader.Column("SentimentText", DataKind.Text, 1) - } - }, new MultiFileSource(dataPath)); + new TextLoader.Column("Label", DataKind.Num, 0), + new TextLoader.Column("SentimentText", DataKind.Text, 1) + }, + hasHeader: true + ); var text = TextFeaturizingEstimator.Create(env, new TextFeaturizingEstimator.Arguments() { diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index 10d0db46a9..130d232e95 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -257,18 +257,15 @@ public void TensorFlowInputsOutputsSchemaTest() public void TensorFlowTransformMNISTConvTest() { var mlContext = new MLContext(seed: 1, conc: 1); - var reader = mlContext.Data.TextReader( - new TextLoader.Arguments() - { - Separator = "tab", - HasHeader = true, - Column = new[] + var reader = mlContext.Data.CreateTextReader( + columns: new[] { new TextLoader.Column("Label", DataKind.U4 , new [] { new TextLoader.Range(0) }, new KeyRange(0, 9)), new TextLoader.Column("Placeholder", DataKind.R4, new []{ new TextLoader.Range(1, 784) }) - } - }); + }, + hasHeader: true + ); var trainData = reader.Read(GetDataPath(TestDatasets.mnistTiny28.trainFilename)); var testData = reader.Read(GetDataPath(TestDatasets.mnistOneClass.testFilename)); @@ -303,17 +300,12 @@ public void TensorFlowTransformMNISTLRTrainingTest() try { var mlContext = new MLContext(seed: 1, conc: 1); - var reader = mlContext.Data.TextReader( - new TextLoader.Arguments - { - Separator = "tab", - HasHeader = false, - Column = new[] + var reader = mlContext.Data.CreateTextReader(columns: new[] { new TextLoader.Column("Label", DataKind.I8, 0), new TextLoader.Column("Placeholder", DataKind.R4, new []{ new TextLoader.Range(1, 784) }) } - }); + ); var trainData = reader.Read(GetDataPath(TestDatasets.mnistTiny28.trainFilename)); var testData = reader.Read(GetDataPath(TestDatasets.mnistOneClass.testFilename)); @@ -398,17 +390,13 @@ private void ExecuteTFTransformMNISTConvTrainingTest(bool shuffle, int? shuffleS { var mlContext = new MLContext(seed: 1, conc: 1); - var reader = mlContext.Data.TextReader(new TextLoader.Arguments - { - Separator = "tab", - HasHeader = false, - Column = new[] + var reader = mlContext.Data.CreateTextReader(new[] { new TextLoader.Column("Label", DataKind.U4, new []{ new TextLoader.Range(0) }, new KeyRange(0, 9)), new TextLoader.Column("TfLabel", DataKind.I8, 0), new TextLoader.Column("Placeholder", DataKind.R4, new []{ new TextLoader.Range(1, 784) }) } - }); + ); var trainData = reader.Read(GetDataPath(TestDatasets.mnistTiny28.trainFilename)); var testData = reader.Read(GetDataPath(TestDatasets.mnistOneClass.testFilename)); @@ -491,16 +479,13 @@ public void TensorFlowTransformMNISTConvSavedModelTest() // of predicted label of a single in-memory example. var mlContext = new MLContext(seed: 1, conc: 1); - var reader = mlContext.Data.TextReader(new TextLoader.Arguments - { - Separator = "tab", - HasHeader = true, - Column = new[] + var reader = mlContext.Data.CreateTextReader(columns: new[] { new TextLoader.Column("Label", DataKind.U4 , new [] { new TextLoader.Range(0) }, new KeyRange(0, 9)), new TextLoader.Column("Placeholder", DataKind.R4, new []{ new TextLoader.Range(1, 784) }) - } - }); + }, + hasHeader: true + ); var trainData = reader.Read(GetDataPath(TestDatasets.mnistTiny28.trainFilename)); var testData = reader.Read(GetDataPath(TestDatasets.mnistOneClass.testFilename)); @@ -625,14 +610,13 @@ public void TensorFlowTransformCifar() var dataFile = GetDataPath("images/images.tsv"); var imageFolder = Path.GetDirectoryName(dataFile); - var data = TextLoader.Create(mlContext, new TextLoader.Arguments() - { - Column = new[] - { + var data = mlContext.Data.ReadFromTextFile(dataFile, + columns: new[] + { new TextLoader.Column("ImagePath", DataKind.TX, 0), new TextLoader.Column("Name", DataKind.TX, 1), - } - }, new MultiFileSource(dataFile)); + } + ); var pipeEstimator = new ImageLoadingEstimator(mlContext, imageFolder, ("ImagePath", "ImageReal")) .Append(new ImageResizingEstimator(mlContext, "ImageReal", "ImageCropped", imageWidth, imageHeight)) @@ -673,14 +657,12 @@ public void TensorFlowTransformCifarSavedModel() var dataFile = GetDataPath("images/images.tsv"); var imageFolder = Path.GetDirectoryName(dataFile); - var data = TextLoader.Create(mlContext, new TextLoader.Arguments() - { - Column = new[] + var data = mlContext.Data.ReadFromTextFile(dataFile, columns: new[] { new TextLoader.Column("ImagePath", DataKind.TX, 0), new TextLoader.Column("Name", DataKind.TX, 1), - } - }, new MultiFileSource(dataFile)); + } + ); var images = ImageLoaderTransform.Create(mlContext, new ImageLoaderTransform.Arguments() { Column = new ImageLoaderTransform.Column[1] @@ -732,14 +714,13 @@ public void TensorFlowTransformCifarInvalidShape() var imageWidth = 28; var dataFile = GetDataPath("images/images.tsv"); var imageFolder = Path.GetDirectoryName(dataFile); - var data = TextLoader.Create(mlContext, new TextLoader.Arguments() - { - Column = new[] + var data = mlContext.Data.ReadFromTextFile(dataFile, + columns: new[] { new TextLoader.Column("ImagePath", DataKind.TX, 0), new TextLoader.Column("Name", DataKind.TX, 1), - } - }, new MultiFileSource(dataFile)); + } + ); var images = ImageLoaderTransform.Create(mlContext, new ImageLoaderTransform.Arguments() { Column = new ImageLoaderTransform.Column[1] diff --git a/test/Microsoft.ML.Tests/Transformers/CustomMappingTests.cs b/test/Microsoft.ML.Tests/Transformers/CustomMappingTests.cs index 840676b1d5..11f9a440ee 100644 --- a/test/Microsoft.ML.Tests/Transformers/CustomMappingTests.cs +++ b/test/Microsoft.ML.Tests/Transformers/CustomMappingTests.cs @@ -56,10 +56,10 @@ public void TestCustomTransformer() { string dataPath = GetDataPath("adult.tiny.with-schema.txt"); var source = new MultiFileSource(dataPath); - var loader = ML.Data.TextReader(new[] { + var loader = ML.Data.CreateTextReader(new[] { new TextLoader.Column("Float1", DataKind.R4, 9), new TextLoader.Column("Float4", DataKind.R4, new[]{new TextLoader.Range(9), new TextLoader.Range(10), new TextLoader.Range(11), new TextLoader.Range(12) }) - }, s => { s.Separator = "\t"; s.HasHeader = true; }); + }, hasHeader: true); var data = loader.Read(source); @@ -95,11 +95,11 @@ public void TestSchemaPropagation() { string dataPath = GetDataPath("adult.test"); var source = new MultiFileSource(dataPath); - var loader = ML.Data.TextReader(new[] { + var loader = ML.Data.CreateTextReader(new[] { new TextLoader.Column("Float1", DataKind.R4, 0), new TextLoader.Column("Float4", DataKind.R4, new[]{new TextLoader.Range(0), new TextLoader.Range(2), new TextLoader.Range(4), new TextLoader.Range(10) }), new TextLoader.Column("Text1", DataKind.Text, 0) - }, s => { s.Separator = ","; s.HasHeader = true; }); + }, hasHeader: true, separatorChar: ',' ); var data = loader.Read(source);