diff --git a/docs/code/MlNetCookBook.md b/docs/code/MlNetCookBook.md index 45feb339a4..908010c721 100644 --- a/docs/code/MlNetCookBook.md +++ b/docs/code/MlNetCookBook.md @@ -131,6 +131,44 @@ var reader = mlContext.Data.CreateTextReader(new[] { var data = reader.Read(dataPath); ``` +You can also create a data model class, and read the data based on this type. + +```csharp +// The data model. This type will be used through the document. +private class InspectedRow +{ + [LoadColumn(0)] + public bool IsOver50K { get; set; } + + [LoadColumn(1)] + public string Workclass { get; set; } + + [LoadColumn(2)] + public string Education { get; set; } + + [LoadColumn(3)] + public string MaritalStatus { get; set; } + + public string[] AllFeatures { get; set; } +} + +private class InspectedRowWithAllFeatures : InspectedRow +{ + public string[] AllFeatures { get; set; } +} + +// Create a new context for ML.NET operations. It can be used for exception tracking and logging, +// as a catalog of available operations and as the source of randomness. +var mlContext = new MLContext(); + +// Read the data into a data view. +var data = mlContext.Data.ReadFromTextFile(dataPath, + // First line of the file is a header, not a data row. + hasHeader: true +) + +``` + ## How do I load data from multiple files? You can again use the `TextLoader`, and specify an array of files to its Read method. @@ -214,7 +252,7 @@ var reader = mlContext.Data.CreateTextReader(ctx => ( Target: ctx.LoadFloat(11) ), // Default separator is tab, but we need a comma. - separator: ','); + separatorChar: ','); // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). @@ -231,17 +269,41 @@ var mlContext = new MLContext(); // Create the reader: define the data columns and where to find them in the text file. var reader = mlContext.Data.CreateTextReader(new[] { // We read the first 10 values as a single float vector. 
- new TextLoader.Column("FeatureVector", DataKind.R4, new[] {new TextLoader.Range(0, 9)}), + new TextLoader.Column("FeatureVector", DataKind.R4, new[] {new TextLoader.Range(0, 10)}), // Separately, read the target variable. - new TextLoader.Column("Target", DataKind.R4, 10) + new TextLoader.Column("Target", DataKind.R4, 11) }, // Default separator is tab, but we need a comma. - s => s.Separator = ","); + separatorChar: ','); // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). var data = reader.Read(dataPath); ``` +Or by creating a data model for it: + +```csharp +private class AdultData +{ + [LoadColumn(0, 10), ColumnName("Features")] + public float[] FeatureVector { get; set; } + + [LoadColumn(11)] + public float Target { get; set; } +} + +// Create a new context for ML.NET operations. It can be used for exception tracking and logging, +// as a catalog of available operations and as the source of randomness. +var mlContext = new MLContext(); + +// Read the data into a data view. +var data = mlContext.Data.ReadFromTextFile(dataPath, + // Default separator is tab, but we need a comma. + separatorChar: ',' +); + +``` + ## How do I debug my experiment or preview my pipeline? Most ML.NET operations are 'lazy': they are not actually processing data, they just validate that the operation is possible, and then defer execution until the output data is actually requested. This provides good efficiency, but makes it hard to step through and debug the experiment. @@ -325,7 +387,7 @@ var transformedData = dataPipeline.Fit(data).Transform(data); // 'transformedData' is a 'promise' of data. Let's actually read it. var someRows = transformedData.AsDynamic // Convert to an enumerable of user-defined type. - .AsEnumerable(mlContext, reuseRowObject: false) + .AsEnumerable(mlContext, reuseRowObject: false) // Take a couple values as an array. 
.Take(4).ToArray(); @@ -342,33 +404,14 @@ var sameFeatureColumns = dynamicData.GetColumn(mlContext, "AllFeatures .Take(20).ToArray(); ``` -The above code assumes that we defined our `InspectedRow` class as follows: -```csharp -private class InspectedRow -{ - public bool IsOver50K; - public string Workclass; - public string Education; - public string MaritalStatus; - public string[] AllFeatures; -} -``` - You can also use the dynamic API to create the equivalent of the previous pipeline. ```csharp // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); -// Create the reader: define the data columns and where to find them in the text file. -var reader = mlContext.Data.CreateTextReader(new[] { - // A boolean column depicting the 'label'. - new TextLoader.Column("IsOver50K", DataKind.BL, 0), - // Three text columns. - new TextLoader.Column("Workclass", DataKind.TX, 1), - new TextLoader.Column("Education", DataKind.TX, 2), - new TextLoader.Column("MaritalStatus", DataKind.TX, 3) - }, +// Read the data into a data view. +var data = mlContext.Data.ReadFromTextFile(dataPath, // First line of the file is a header, not a data row. hasHeader: true ); @@ -377,17 +420,13 @@ var reader = mlContext.Data.CreateTextReader(new[] { // together into one. var dynamicPipeline = mlContext.Transforms.Concatenate("AllFeatures", "Education", "MaritalStatus"); -// Let's verify that the data has been read correctly. -// First, we read the data file. -var data = reader.Read(dataPath); - // Fit our data pipeline and transform data with it. var transformedData = dynamicPipeline.Fit(data).Transform(data); // 'transformedData' is a 'promise' of data. Let's actually read it. var someRows = transformedData // Convert to an enumerable of user-defined type. 
- .AsEnumerable(mlContext, reuseRowObject: false) + .AsEnumerable(mlContext, reuseRowObject: false) // Take a couple values as an array. .Take(4).ToArray(); @@ -431,7 +470,7 @@ var reader = mlContext.Data.CreateTextReader(ctx => ( // The data file has header. hasHeader: true, // Default separator is tab, but we need a semicolon. - separator: ';'); + separatorChar: ';'); // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). @@ -476,22 +515,12 @@ var mlContext = new MLContext(); // Step one: read the data as an IDataView. // First, we define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.CreateTextReader(new[] { - // We read the first 11 values as a single float vector. - new TextLoader.Column("FeatureVector", DataKind.R4, 0, 10), - - // Separately, read the target variable. - new TextLoader.Column("Target", DataKind.R4, 11), - }, +// Read the data into a data view. Remember though, readers are lazy, so the actual reading will happen when the data is accessed. +var trainData = mlContext.Data.ReadFromTextFile(dataPath, // First line of the file is a header, not a data row. - hasHeader: true, - // Default separator is tab, but we need a semicolon. - separatorChar: ';' + separatorChar: ',' ); -// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). -var trainData = reader.Read(trainDataPath); - // Sometime, caching data in-memory after its first access can save some loading time when the data is going to be used // several times somewhere. The caching mechanism is also lazy; it only caches things after being used. // User can replace all the subsequently uses of "trainData" with "cachedTrainData". 
We still use "trainData" because @@ -537,7 +566,10 @@ var metrics = mlContext.Regression.Evaluate(model.Transform(testData), label: r Calculating the metrics with the dynamic API is as follows. ```csharp // Read the test dataset. -var testData = reader.Read(testDataPath); +var testData = mlContext.Data.ReadFromTextFile(testDataPath, + // Default separator is tab, but we need a comma. + separatorChar: ',' +); // Calculate metrics of the model on the test data. var metrics = mlContext.Regression.Evaluate(model.Transform(testData), label: "Target"); ``` @@ -605,7 +637,7 @@ var reader = mlContext.Data.CreateTextReader(ctx => ( Label: ctx.LoadText(4) ), // Default separator is tab, but the dataset has comma. - separator: ','); + separatorChar: ','); // Retrieve the training data. var trainData = reader.Read(irisDataPath); @@ -644,29 +676,19 @@ You can also use the dynamic API to create the equivalent of the previous pipeli var mlContext = new MLContext(); // Step one: read the data as an IDataView. -// First, we define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.CreateTextReader(new[] { - new TextLoader.Column("SepalLength", DataKind.R4, 0), - new TextLoader.Column("SepalWidth", DataKind.R4, 1), - new TextLoader.Column("PetalLength", DataKind.R4, 2), - new TextLoader.Column("PetalWidth", DataKind.R4, 3), - // Label: kind of iris. - new TextLoader.Column("Label", DataKind.TX, 4), - }, + // Retrieve the training data. +var trainData = mlContext.Data.ReadFromTextFile(irisDataPath, // Default separator is tab, but the dataset has comma. separatorChar: ',' ); -// Retrieve the training data. -var trainData = reader.Read(irisDataPath); - // Build the training pipeline. var dynamicPipeline = // Concatenate all the features together into one column 'Features'. 
mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") // Note that the label is text, so it needs to be converted to key. .Append(mlContext.Transforms.Categorical.MapValueToKey("Label"), TransformerScope.TrainTest) - // Cache data in moemory for steps after the cache check point stage. + // Cache data in memory for steps after the cache check point stage. .AppendCacheCheckpoint(mlContext) // Use the multi-class SDCA model to predict the label using features. .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent()) @@ -821,7 +843,7 @@ var reader = mlContext.Data.CreateTextReader(ctx => ( Label: ctx.LoadText(4) ), // Default separator is tab, but the dataset has comma. - separator: ','); + separatorChar: ','); // Retrieve the training data. var trainData = reader.Read(dataPath); @@ -914,7 +936,7 @@ var reader = mlContext.Data.CreateTextReader(ctx => ( Label: ctx.LoadText(4) ), // Default separator is tab, but the dataset has comma. - separator: ','); + separatorChar: ','); // Read the training data. var trainData = reader.Read(dataPath); @@ -937,24 +959,27 @@ var meanVarValues = normalizedData.GetColumn(r => r.MeanVarNormalized).ToArray() You can achieve the same results using the dynamic API. ```csharp +//data model for the Iris class +private class IrisInputAllFeatures +{ + // Unfortunately, we still need the dummy 'Label' column to be present. + [ColumnName("Label"), LoadColumn(4)] + public string IgnoredLabel { get; set; } + + [LoadColumn(4, loadAllOthers:true)] + public float Features { get; set; } +} + // Create a new context for ML.NET operations. It can be used for exception tracking and logging, // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); -// Define the reader: specify the data columns and where to find them in the text file. 
-var reader = mlContext.Data.CreateTextReader(new[] { - // The four features of the Iris dataset will be grouped together as one Features column. - new TextLoader.Column("Features", DataKind.R4, 0, 3), - // Label: kind of iris. - new TextLoader.Column("Label", DataKind.TX, 4), - }, +// Read the training data. +var trainData = mlContext.Data.ReadFromTextFile(dataPath, // Default separator is tab, but the dataset has comma. separatorChar: ',' ); -// Read the training data. -var trainData = reader.Read(dataPath); - // Apply all kinds of standard ML.NET normalization to the raw features. var pipeline = mlContext.Transforms.Normalize( @@ -1270,7 +1295,7 @@ var reader = mlContext.Data.CreateTextReader(ctx => ( Label: ctx.LoadText(4) ), // Default separator is tab, but the dataset has comma. - separator: ','); + separatorChar: ','); // Read the data. var data = reader.Read(dataPath); @@ -1315,24 +1340,11 @@ You can achieve the same results using the dynamic API. var mlContext = new MLContext(); // Step one: read the data as an IDataView. -// First, we define the reader: specify the data columns and where to find them in the text file. -var reader = mlContext.Data.CreateTextReader(new[] - { - // We read the first 11 values as a single float vector. - new TextLoader.Column("SepalLength", DataKind.R4, 0), - new TextLoader.Column("SepalWidth", DataKind.R4, 1), - new TextLoader.Column("PetalLength", DataKind.R4, 2), - new TextLoader.Column("PetalWidth", DataKind.R4, 3), - // Label: kind of iris. - new TextLoader.Column("Label", DataKind.TX, 4), - }, +var data = mlContext.Data.ReadFromTextFile(dataPath, // Default separator is tab, but the dataset has comma. separatorChar: ',' ); -// Read the data. -var data = reader.Read(dataPath); - // Build the training pipeline. var dynamicPipeline = // Concatenate all the features together into one column 'Features'. 
@@ -1390,7 +1402,7 @@ var reader = mlContext.Data.CreateTextReader(ctx => ( Label: ctx.LoadText(4) ), // Default separator is tab, but the dataset has comma. - separator: ','); + separatorChar: ','); // Read the data. var data = reader.Read(dataPath); diff --git a/src/Microsoft.ML.Data/Data/SchemaDefinition.cs b/src/Microsoft.ML.Data/Data/SchemaDefinition.cs index d0f72b42a4..437d37d071 100644 --- a/src/Microsoft.ML.Data/Data/SchemaDefinition.cs +++ b/src/Microsoft.ML.Data/Data/SchemaDefinition.cs @@ -73,25 +73,12 @@ public sealed class ColumnAttribute : Attribute public ColumnAttribute(string ordinal, string name = null) { Name = name; - Ordinal = ordinal; } /// /// Column name. /// public string Name { get; } - - /// - /// Contains positions of indices of source columns in the form - /// of ranges. Examples of range: if we want to include just column - /// with index 1 we can write the range as 1, if we want to include - /// columns 1 to 10 then we can write the range as 1-10 and we want to include all the - /// columns from column with index 1 until end then we can write 1-*. - /// - /// This takes sequence of ranges that are comma seperated, example: - /// 1,2-5,10-* - /// - public string Ordinal { get; } } /// diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumnAttribute.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumnAttribute.cs new file mode 100644 index 0000000000..fcf0cbae3f --- /dev/null +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/LoadColumnAttribute.cs @@ -0,0 +1,73 @@ +// Licensed to the .NET Foundation under one or more agreements. +// The .NET Foundation licenses this file to you under the MIT license. +// See the LICENSE file in the project root for more information. + +using Microsoft.ML.Runtime.Data; +using System; +using System.Collections.Generic; + +namespace Microsoft.ML.Data +{ +// REVIEW: The Start field is decorated with [Obsolete], and this warning disables using Obsolete for this class. 
+// The Start field should get deleted together with the Legacy API. +#pragma warning disable 618 + /// + /// Describes column information such as name and the source columns indices that this + /// column encapsulates. + /// + [AttributeUsage(AttributeTargets.Field | AttributeTargets.Property, AllowMultiple = false, Inherited = true)] + public sealed class LoadColumnAttribute : Attribute + { + /// + /// Initializes new instance of . + /// + /// The index of the column in the text file. + // REVIEW: Remove calling the private constructor with just the start parameter, + // when the Legacy API's TextLoader gets deleted, and with it the Start field here. + public LoadColumnAttribute(int columnIndex) + : this(columnIndex.ToString()) + { + Sources.Add(new TextLoader.Range(columnIndex)); + } + + /// + /// Initializes new instance of . + /// + /// The starting column index, for the range. + /// The ending column index, for the range. + // REVIEW: Calling the private constructor with just the start parameter, is incorrect, + // but it is just temporary there, until the Legacy API's TextLoader gets deleted, together with the Start field. + public LoadColumnAttribute(int start, int end) + : this(start.ToString()) + { + Sources.Add(new TextLoader.Range(start, end)); + } + + /// + /// Initializes new instance of . + /// + /// Distinct text file column indices to load as part of this column. + // REVIEW: Calling the private constructor with just the columnIndexes[0] parameter, is incorrect, + // but it is just temporary there, until the Legacy API's TextLoader gets deleted together with the Start field. + public LoadColumnAttribute(int[] columnIndexes) + : this(columnIndexes[0].ToString()) // REVIEW: this is incorrect, but it is just temporary there, until the Legacy API's TextLoader gets deleted. 
+ { + foreach (var col in columnIndexes) + Sources.Add(new TextLoader.Range(col)); + } + + [Obsolete("Should be deleted together with the Legacy project.")] + private LoadColumnAttribute(string start) + { + Sources = new List(); + Start = start; + } + + internal List Sources; + + [Obsolete("Should be deleted together with the Legacy project.")] + [BestFriend] + internal string Start { get; } + } +#pragma warning restore 618 +} diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs index 7e8a90b75e..f47e2e8118 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs @@ -12,6 +12,7 @@ using System; using System.Collections.Generic; using System.Linq; +using System.Reflection; using System.Text; using Float = System.Single; @@ -343,10 +344,10 @@ public class ArgumentsCore " missing value and an empty value is denoted by \"\". When false, consecutive separators" + " denote an empty value.", ShortName = "quote")] - public bool AllowQuoting = true; + public bool AllowQuoting = DefaultArguments.AllowQuoting; [Argument(ArgumentType.AtMostOnce, HelpText = "Whether the input may include sparse representations", ShortName = "sparse")] - public bool AllowSparse = true; + public bool AllowSparse = DefaultArguments.AllowSparse; [Argument(ArgumentType.AtMostOnce, HelpText = "Number of source columns in the text data. Default is that sparse rows contain their size information.", @@ -354,17 +355,17 @@ public class ArgumentsCore public int? InputSize; [Argument(ArgumentType.AtMostOnce, Visibility = ArgumentAttribute.VisibilityType.CmdLineOnly, HelpText = "Source column separator. 
Options: tab, space, comma, single character", ShortName = "sep")] - public string Separator = "tab"; + public string Separator = DefaultArguments.Separator.ToString(); [Argument(ArgumentType.AtMostOnce, Name = nameof(Separator), Visibility = ArgumentAttribute.VisibilityType.EntryPointsOnly, HelpText = "Source column separator.", ShortName = "sep")] - public char[] SeparatorChars = new[] { '\t' }; + public char[] SeparatorChars = new[] { DefaultArguments.Separator }; [Argument(ArgumentType.Multiple, HelpText = "Column groups. Each group is specified as name:type:numeric-ranges, eg, col=Features:R4:1-17,26,35-40", ShortName = "col", SortOrder = 1)] public Column[] Column; [Argument(ArgumentType.AtMostOnce, HelpText = "Remove trailing whitespace from lines", ShortName = "trim")] - public bool TrimWhitespace; + public bool TrimWhitespace = DefaultArguments.TrimWhitespace; [Argument(ArgumentType.AtMostOnce, ShortName = "header", HelpText = "Data file has header with feature names. Header is read only if options 'hs' and 'hf' are not specified.")] @@ -392,6 +393,15 @@ public sealed class Arguments : ArgumentsCore public long? MaxRows; } + internal static class DefaultArguments + { + internal const bool AllowQuoting = true; + internal const bool AllowSparse = true; + internal const char Separator = '\t'; + internal const bool HasHeader = false; + internal const bool TrimWhitespace = false; + } + /// /// Used as an input column range. /// A variable length segment (extending to the end of the input line) is represented by Lim == SrcLim. 
@@ -1352,6 +1362,75 @@ public void Save(ModelSaveContext ctx) public IDataView Read(string path) => Read(new MultiFileSource(path)); + internal static TextLoader CreateTextReader(IHostEnvironment host, + bool hasHeader = DefaultArguments.HasHeader, + char separator = DefaultArguments.Separator, + bool allowQuotedStrings = DefaultArguments.AllowQuoting, + bool supportSparse = DefaultArguments.AllowSparse, + bool trimWhitespace = DefaultArguments.TrimWhitespace) + { + var userType = typeof(TInput); + + var fieldInfos = userType.GetFields(BindingFlags.Public | BindingFlags.Instance); + + var propertyInfos = + userType + .GetProperties(BindingFlags.Public | BindingFlags.Instance) + .Where(x => x.CanRead && x.CanWrite && x.GetGetMethod() != null && x.GetSetMethod() != null && x.GetIndexParameters().Length == 0); + + var memberInfos = (fieldInfos as IEnumerable).Concat(propertyInfos).ToArray(); + + var columns = new List(); + + for (int index = 0; index < memberInfos.Length; index++) + { + var memberInfo = memberInfos[index]; + var mappingAttr = memberInfo.GetCustomAttribute(); + + host.Assert(mappingAttr != null, $"Field or property {memberInfo.Name} is missing the {nameof(LoadColumnAttribute)} attribute"); + + var mappingAttrName = memberInfo.GetCustomAttribute(); + + var column = new Column(); + column.Name = mappingAttrName?.Name ?? memberInfo.Name; + column.Source = mappingAttr.Sources.ToArray(); + DataKind dk; + switch (memberInfo) + { + case FieldInfo field: + if (!DataKindExtensions.TryGetDataKind(field.FieldType.IsArray ? field.FieldType.GetElementType() : field.FieldType, out dk)) + throw Contracts.Except($"Field {memberInfo.Name} is of unsupported type."); + + break; + + case PropertyInfo property: + if (!DataKindExtensions.TryGetDataKind(property.PropertyType.IsArray ? 
property.PropertyType.GetElementType() : property.PropertyType, out dk)) + throw Contracts.Except($"Property {memberInfo.Name} is of unsupported type."); + break; + + default: + Contracts.Assert(false); + throw Contracts.ExceptNotSupp("Expected a FieldInfo or a PropertyInfo"); + } + + column.Type = dk; + + columns.Add(column); + } + + Arguments args = new Arguments + { + HasHeader = hasHeader, + SeparatorChars = new[] { separator }, + AllowQuoting = allowQuotedStrings, + AllowSparse = supportSparse, + TrimWhitespace = trimWhitespace, + Column = columns.ToArray() + }; + + return new TextLoader(host, args); + } + private sealed class BoundLoader : IDataLoader { private readonly TextLoader _reader; diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs index b4cf936a38..0c8fc1b574 100644 --- a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs +++ b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs @@ -5,9 +5,7 @@ using Microsoft.ML.Runtime; using Microsoft.ML.Runtime.Data; using Microsoft.ML.Runtime.Data.IO; -using System; using System.IO; -using static Microsoft.ML.Runtime.Data.TextLoader; namespace Microsoft.ML { @@ -16,35 +14,66 @@ public static class TextLoaderSaverCatalog /// /// Create a text reader . /// - /// The catalog. + /// The catalog. /// The columns of the schema. /// Whether the file has a header. /// The character used as separator between data points in a row. By default the tab character is used as separator. /// The optional location of a data sample. 
public static TextLoader CreateTextReader(this DataOperations catalog, - Column[] columns, bool hasHeader = false, char separatorChar = '\t', IMultiStreamSource dataSample = null) + TextLoader.Column[] columns, + bool hasHeader = TextLoader.DefaultArguments.HasHeader, + char separatorChar = TextLoader.DefaultArguments.Separator, + IMultiStreamSource dataSample = null) => new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, hasHeader, separatorChar, dataSample); /// /// Create a text reader . /// - /// The catalog. + /// The catalog. /// Defines the settings of the load operation. /// Allows to expose items that can be used for reading. - public static TextLoader CreateTextReader(this DataOperations catalog, Arguments args, IMultiStreamSource dataSample = null) + public static TextLoader CreateTextReader(this DataOperations catalog, + TextLoader.Arguments args, + IMultiStreamSource dataSample = null) => new TextLoader(CatalogUtils.GetEnvironment(catalog), args, dataSample); + /// + /// Create a text reader by inferencing the dataset schema from a data model type. + /// + /// The catalog. + /// Does the file contains header? + /// Column separator character. Default is '\t' + /// Whether the input may include quoted values, + /// which can contain separator characters, colons, + /// and distinguish empty values from missing values. When true, consecutive separators + /// denote a missing value and an empty value is denoted by \"\". + /// When false, consecutive separators denote an empty value. 
+ /// Whether the input may include sparse representations. For example, + /// if one of the rows contains "5 2:6 4:3", that means there are 5 columns, all zero + /// except for the 3rd and 5th columns, which have values 6 and 3. + /// Remove trailing whitespace from lines + public static TextLoader CreateTextReader(this DataOperations catalog, + bool hasHeader = TextLoader.DefaultArguments.HasHeader, + char separatorChar = TextLoader.DefaultArguments.Separator, + bool allowQuotedStrings = TextLoader.DefaultArguments.AllowQuoting, + bool supportSparse = TextLoader.DefaultArguments.AllowSparse, + bool trimWhitespace = TextLoader.DefaultArguments.TrimWhitespace) + => TextLoader.CreateTextReader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace); + /// /// Read a data view from a text file using . /// - /// The catalog. + /// The catalog. /// The columns of the schema. /// Whether the file has a header. /// The character used as separator between data points in a row. By default the tab character is used as separator. /// The path to the file. /// The data view. public static IDataView ReadFromTextFile(this DataOperations catalog, - string path, Column[] columns, bool hasHeader = false, char separatorChar = '\t') + string path, + TextLoader.Column[] columns, + bool hasHeader = TextLoader.DefaultArguments.HasHeader, + char separatorChar = TextLoader.DefaultArguments.Separator) { Contracts.CheckNonEmpty(path, nameof(path)); @@ -59,10 +88,43 @@ public static IDataView ReadFromTextFile(this DataOperations catalog, /// /// Read a data view from a text file using . /// - /// The catalog. + /// The catalog. + /// Does the file contain a header? + /// Column separator character. Default is '\t' + /// Whether the input may include quoted values, + /// which can contain separator characters, colons, + /// and distinguish empty values from missing values. 
When true, consecutive separators + /// denote a missing value and an empty value is denoted by "". + /// When false, consecutive separators denote an empty value. + /// Whether the input may include sparse representations. For example, + /// if one of the rows contains "5 2:6 4:3", that means there are 5 columns, all zero + /// except for the 3rd and 5th columns, which have values 6 and 3. + /// Remove trailing whitespace from lines + /// The path to the file. + /// The data view. + public static IDataView ReadFromTextFile(this DataOperations catalog, + string path, + bool hasHeader = TextLoader.DefaultArguments.HasHeader, + char separatorChar = TextLoader.DefaultArguments.Separator, + bool allowQuotedStrings = TextLoader.DefaultArguments.AllowQuoting, + bool supportSparse = TextLoader.DefaultArguments.AllowSparse, + bool trimWhitespace = TextLoader.DefaultArguments.TrimWhitespace) + { + Contracts.CheckNonEmpty(path, nameof(path)); + + // REVIEW: it is almost always a mistake to have a 'trainable' text loader here. + // Therefore, we are going to disallow data sample. + return TextLoader.CreateTextReader(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuotedStrings, supportSparse, trimWhitespace) + .Read(new MultiFileSource(path)); + } + + /// + /// Read a data view from a text file using . + /// + /// The catalog. /// Specifies a file from which to read. /// Defines the settings of the load operation. - public static IDataView ReadFromTextFile(this DataOperations catalog, string path, Arguments args = null) + public static IDataView ReadFromTextFile(this DataOperations catalog, string path, TextLoader.Arguments args = null) { Contracts.CheckNonEmpty(path, nameof(path)); @@ -75,22 +137,27 @@ public static IDataView ReadFromTextFile(this DataOperations catalog, string pat /// /// Save the data view as text. /// - /// The catalog. + /// The catalog. /// The data view to save. /// The stream to write to. - /// The column separator. 
+ /// The column separator. /// Whether to write the header row. /// Whether to write the header comment with the schema. /// Whether to keep hidden columns in the dataset. - public static void SaveAsText(this DataOperations catalog, IDataView data, Stream stream, - char separator = '\t', bool headerRow = true, bool schema = true, bool keepHidden = false) + public static void SaveAsText(this DataOperations catalog, + IDataView data, + Stream stream, + char separatorChar = TextLoader.DefaultArguments.Separator, + bool headerRow = TextLoader.DefaultArguments.HasHeader, + bool schema = true, + bool keepHidden = false) { Contracts.CheckValue(catalog, nameof(catalog)); Contracts.CheckValue(data, nameof(data)); Contracts.CheckValue(stream, nameof(stream)); var env = catalog.GetEnvironment(); - var saver = new TextSaver(env, new TextSaver.Arguments { Separator = separator.ToString(), OutputHeader = headerRow, OutputSchema = schema }); + var saver = new TextSaver(env, new TextSaver.Arguments { Separator = separatorChar.ToString(), OutputHeader = headerRow, OutputSchema = schema }); using (var ch = env.Start("Saving data")) DataSaverUtils.SaveDataView(ch, saver, data, stream, keepHidden); diff --git a/src/Microsoft.ML.Legacy/Data/TextLoader.cs b/src/Microsoft.ML.Legacy/Data/TextLoader.cs index 5c0b185ed1..6e6dab405d 100644 --- a/src/Microsoft.ML.Legacy/Data/TextLoader.cs +++ b/src/Microsoft.ML.Legacy/Data/TextLoader.cs @@ -86,21 +86,21 @@ public TextLoader CreateFrom(bool useHeader = false, for (int index = 0; index < memberInfos.Length; index++) { var memberInfo = memberInfos[index]; - var mappingAttr = memberInfo.GetCustomAttribute(); + var mappingAttr = memberInfo.GetCustomAttribute(); if (mappingAttr == null) - throw Contracts.Except($"Field or property {memberInfo.Name} is missing ColumnAttribute"); - - if (Regex.Match(mappingAttr.Ordinal, @"[^(0-9,\*\-~)]+").Success) - throw Contracts.Except($"{mappingAttr.Ordinal} contains invalid characters. 
" + + throw Contracts.Except($"Field or property {memberInfo.Name} is missing LoadColumnAttributeAttribute"); +#pragma warning disable 618 + if (Regex.Match(mappingAttr.Start, @"[^(0-9,\*\-~)]+").Success) + throw Contracts.Except($"{mappingAttr.Start} contains invalid characters. " + $"Valid characters are 0-9, *, - and ~"); var mappingNameAttr = memberInfo.GetCustomAttribute(); - var name = mappingAttr.Name ?? mappingNameAttr?.Name ?? memberInfo.Name; + var name = mappingNameAttr?.Name ?? memberInfo.Name; Runtime.Data.TextLoader.Range[] sources; - if (!Runtime.Data.TextLoader.Column.TryParseSourceEx(mappingAttr.Ordinal, out sources)) - throw Contracts.Except($"{mappingAttr.Ordinal} could not be parsed."); - + if (!Runtime.Data.TextLoader.Column.TryParseSourceEx(mappingAttr.Start, out sources)) + throw Contracts.Except($"{mappingAttr.Start} could not be parsed."); +#pragma warning restore 618 Contracts.Assert(sources != null); TextLoaderColumn tlc = new TextLoaderColumn(); diff --git a/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs b/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs index 6e0e28fbab..f38946e4e0 100644 --- a/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs +++ b/test/Microsoft.ML.Benchmarks/PredictionEngineBench.cs @@ -146,10 +146,10 @@ public void MakeBreastCancerPredictions() public class SentimentData { - [ColumnName("Label"), Column("0")] + [ColumnName("Label"), LoadColumn(0)] public bool Sentiment; - [Column("1")] + [LoadColumn(1)] public string SentimentText; } diff --git a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs index 3ff43d0c21..930c38af10 100644 --- a/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs +++ b/test/Microsoft.ML.Benchmarks/StochasticDualCoordinateAscentClassifierBench.cs @@ -167,19 +167,19 @@ private void Consume(IEnumerable predictions) public class IrisData { - 
[Column("0")] + [LoadColumn(0)] public float Label; - [Column("1")] + [LoadColumn(1)] public float SepalLength; - [Column("2")] + [LoadColumn(2)] public float SepalWidth; - [Column("3")] + [LoadColumn(3)] public float PetalLength; - [Column("4")] + [LoadColumn(4)] public float PetalWidth; } diff --git a/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs b/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs index b8b06bb8c3..a5a6eb0ce7 100644 --- a/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs +++ b/test/Microsoft.ML.FSharp.Tests/SmokeTests.fs @@ -65,9 +65,9 @@ open Xunit module SmokeTest1 = type SentimentData() = - [] + [] val mutable SentimentText : string - [] + [] val mutable Sentiment : float32 type SentimentPrediction() = @@ -131,10 +131,10 @@ module SmokeTest2 = [] type SentimentData = - { [] + { [] SentimentText : string - [] + [] Sentiment : float32 } [] @@ -197,10 +197,10 @@ module SmokeTest2 = module SmokeTest3 = type SentimentData() = - [] + [] member val SentimentText = "".AsMemory() with get, set - [] + [] member val Sentiment = 0.0 with get, set type SentimentPrediction() = diff --git a/test/Microsoft.ML.Tests/CollectionDataSourceTests.cs b/test/Microsoft.ML.Tests/CollectionDataSourceTests.cs index 1fc6e151a0..17ba426483 100644 --- a/test/Microsoft.ML.Tests/CollectionDataSourceTests.cs +++ b/test/Microsoft.ML.Tests/CollectionDataSourceTests.cs @@ -175,10 +175,10 @@ public void CanTrain() public void CanTrainProperties() { var pipeline = new Legacy.LearningPipeline(); - var data = new List() { - new IrisDataProperties { SepalLength = 1f, SepalWidth = 1f, PetalLength=0.3f, PetalWidth=5.1f, Label=1}, - new IrisDataProperties { SepalLength = 1f, SepalWidth = 1f, PetalLength=0.3f, PetalWidth=5.1f, Label=1}, - new IrisDataProperties { SepalLength = 1.2f, SepalWidth = 0.5f, PetalLength=0.3f, PetalWidth=5.1f, Label=0} + var data = new List() { + new IrisData { SepalLength = 1f, SepalWidth = 1f, PetalLength=0.3f, PetalWidth=5.1f, Label=1}, + new IrisData { SepalLength = 1f, 
SepalWidth = 1f, PetalLength=0.3f, PetalWidth=5.1f, Label=1}, + new IrisData { SepalLength = 1.2f, SepalWidth = 0.5f, PetalLength=0.3f, PetalWidth=5.1f, Label=0} }; var collection = CollectionDataSource.Create(data); @@ -186,9 +186,9 @@ public void CanTrainProperties() pipeline.Add(new ColumnConcatenator(outputColumn: "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")); pipeline.Add(new StochasticDualCoordinateAscentClassifier()); - var model = pipeline.Train(); + var model = pipeline.Train(); - IrisPredictionProperties prediction = model.Predict(new IrisDataProperties() + IrisPredictionProperties prediction = model.Predict(new IrisData { SepalLength = 3.3f, SepalWidth = 1.6f, @@ -202,9 +202,9 @@ public void CanTrainProperties() pipeline.Add(new ColumnConcatenator(outputColumn: "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")); pipeline.Add(new StochasticDualCoordinateAscentClassifier()); - model = pipeline.Train(); + model = pipeline.Train(); - prediction = model.Predict(new IrisDataProperties() + prediction = model.Predict(new IrisData { SepalLength = 3.3f, SepalWidth = 1.6f, @@ -216,28 +216,28 @@ public void CanTrainProperties() public class Input { - [Column("0")] + [LoadColumn(0)] public float Number1; - [Column("1")] + [LoadColumn(1)] public string String1; } public class IrisData { - [Column("0")] + [LoadColumn(0)] public float Label; - [Column("1")] + [LoadColumn(1)] public float SepalLength; - [Column("2")] + [LoadColumn(2)] public float SepalWidth; - [Column("3")] + [LoadColumn(3)] public float PetalLength; - [Column("4")] + [LoadColumn(4)] public float PetalWidth; } @@ -247,30 +247,6 @@ public class IrisPrediction public float[] PredictedLabels; } - public class IrisDataProperties - { - private float _Label; - private float _SepalLength; - private float _SepalWidth; - private float _PetalLength; - private float _PetalWidth; - - [Column("0")] - public float Label { get { return _Label; } set { _Label = value; } } - 
- [Column("1")] - public float SepalLength { get { return _SepalLength; } set { _SepalLength = value; } } - - [Column("2")] - public float SepalWidth { get { return _SepalWidth; } set { _SepalWidth = value; } } - - [Column("3")] - public float PetalLength { get { return _PetalLength; } set { _PetalLength = value; } } - - [Column("4")] - public float PetalWidth { get { return _PetalWidth; } set { _PetalWidth = value; } } - } - public class IrisPredictionProperties { private float[] _PredictedLabels; diff --git a/test/Microsoft.ML.Tests/LearningPipelineTests.cs b/test/Microsoft.ML.Tests/LearningPipelineTests.cs index 64c6abd8cc..651d29a940 100644 --- a/test/Microsoft.ML.Tests/LearningPipelineTests.cs +++ b/test/Microsoft.ML.Tests/LearningPipelineTests.cs @@ -49,7 +49,7 @@ public void CanAddAndRemoveFromPipeline() private class InputData { - [Column(ordinal: "1")] + [LoadColumn(columnIndex: 1)] public string F1; } diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs b/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs index eb79c12aa2..47b33e313f 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/ApiScenariosTests.cs @@ -3,7 +3,6 @@ // See the LICENSE file in the project root for more information. 
using Microsoft.ML.Data; -using Microsoft.ML.Runtime.Data; using Microsoft.ML.TestFramework; using Xunit.Abstractions; @@ -20,14 +19,22 @@ public ApiScenariosTests(ITestOutputHelper output) : base(output) public class IrisData : IrisDataNoLabel { + [LoadColumn(4), ColumnName("Label")] public string Label; } public class IrisDataNoLabel { + [LoadColumn(0)] public float SepalLength; + + [LoadColumn(1)] public float SepalWidth; + + [LoadColumn(2)] public float PetalLength; + + [LoadColumn(3)] public float PetalWidth; } @@ -39,8 +46,10 @@ public class IrisPrediction public class SentimentData { - [ColumnName("Label")] + [LoadColumn(0), ColumnName("Label")] public bool Sentiment; + + [LoadColumn(1)] public string SentimentText; } diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs index 7569adb2fd..b0075f9975 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/CookbookSamples/CookbookSamplesDynamicApi.cs @@ -40,15 +40,8 @@ private void IntermediateData(string dataPath) // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); - // Create the reader: define the data columns and where to find them in the text file. - var reader = mlContext.Data.CreateTextReader(new[] { - // A boolean column depicting the 'label'. - new TextLoader.Column("IsOver50K", DataKind.BL, 0), - // Three text columns. - new TextLoader.Column("Workclass", DataKind.TX, 1), - new TextLoader.Column("Education", DataKind.TX, 2), - new TextLoader.Column("MaritalStatus", DataKind.TX, 3) - }, + // Read the data into a data view. + var data = mlContext.Data.ReadFromTextFile(dataPath, // First line of the file is a header, not a data row. hasHeader: true ); @@ -57,17 +50,13 @@ private void IntermediateData(string dataPath) // together into one. 
var dynamicPipeline = mlContext.Transforms.Concatenate("AllFeatures", "Education", "MaritalStatus"); - // Let's verify that the data has been read correctly. - // First, we read the data file. - var data = reader.Read(dataPath); - // Fit our data pipeline and transform data with it. var transformedData = dynamicPipeline.Fit(data).Transform(data); // 'transformedData' is a 'promise' of data. Let's actually read it. var someRows = transformedData // Convert to an enumerable of user-defined type. - .AsEnumerable(mlContext, reuseRowObject: false) + .AsEnumerable(mlContext, reuseRowObject: false) // Take a couple values as an array. .Take(4).ToArray(); @@ -90,23 +79,14 @@ private void TrainRegression(string trainDataPath, string testDataPath, string m var mlContext = new MLContext(); // Step one: read the data as an IDataView. - // First, we define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.CreateTextReader(new[] { - // We read the first 11 values as a single float vector. - new TextLoader.Column("FeatureVector", DataKind.R4, 0, 10), - - // Separately, read the target variable. - new TextLoader.Column("Target", DataKind.R4, 11), - }, + // Read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). + var trainData = mlContext.Data.ReadFromTextFile(trainDataPath, // First line of the file is a header, not a data row. hasHeader: true, // Default separator is tab, but we need a semicolon. separatorChar: ';' ); - // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). - var trainData = reader.Read(trainDataPath); - // Sometime, caching data in-memory after its first access can save some loading time when the data is going to be used // several times somewhere. The caching mechanism is also lazy; it only caches things after being used. 
// User can replace all the subsequently uses of "trainData" with "cachedTrainData". We still use "trainData" because @@ -136,7 +116,13 @@ private void TrainRegression(string trainDataPath, string testDataPath, string m var model = dynamicPipeline.Fit(trainData); // Read the test dataset. - var testData = reader.Read(testDataPath); + var testData = mlContext.Data.ReadFromTextFile(testDataPath, + // First line of the file is a header, not a data row. + hasHeader: true, + // Default separator is tab, but we need a semicolon. + separatorChar: ';' + ); + // Calculate metrics of the model on the test data. var metrics = mlContext.Regression.Evaluate(model.Transform(testData), label: "Target"); @@ -166,29 +152,19 @@ private ITransformer TrainOnIris(string irisDataPath) var mlContext = new MLContext(); // Step one: read the data as an IDataView. - // First, we define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.CreateTextReader(new[] { - new TextLoader.Column("SepalLength", DataKind.R4, 0), - new TextLoader.Column("SepalWidth", DataKind.R4, 1), - new TextLoader.Column("PetalLength", DataKind.R4, 2), - new TextLoader.Column("PetalWidth", DataKind.R4, 3), - // Label: kind of iris. - new TextLoader.Column("Label", DataKind.TX, 4), - }, + // Retrieve the training data. + var trainData = mlContext.Data.ReadFromTextFile(irisDataPath, // Default separator is tab, but the dataset has comma. separatorChar: ',' ); - // Retrieve the training data. - var trainData = reader.Read(irisDataPath); - // Build the training pipeline. var dynamicPipeline = // Concatenate all the features together into one column 'Features'. mlContext.Transforms.Concatenate("Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") // Note that the label is text, so it needs to be converted to key. 
.Append(mlContext.Transforms.Conversion.MapValueToKey("Label"), TransformerScope.TrainTest) - // Cache data in moemory for steps after the cache check point stage. + // Cache data in memory for steps after the cache check point stage. .AppendCacheCheckpoint(mlContext) // Use the multi-class SDCA model to predict the label using features. .Append(mlContext.MulticlassClassification.Trainers.StochasticDualCoordinateAscent()) @@ -233,20 +209,12 @@ private void NormalizationWorkout(string dataPath) // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); - // Define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.CreateTextReader(new[] { - // The four features of the Iris dataset will be grouped together as one Features column. - new TextLoader.Column("Features", DataKind.R4, 0, 3), - // Label: kind of iris. - new TextLoader.Column("Label", DataKind.TX, 4), - }, + // Read the training data. + var trainData = mlContext.Data.ReadFromTextFile(dataPath, // Default separator is tab, but the dataset has comma. separatorChar: ',' ); - // Read the training data. - var trainData = reader.Read(dataPath); - // Apply all kinds of standard ML.NET normalization to the raw features. var pipeline = mlContext.Transforms.Normalize( @@ -265,17 +233,6 @@ private void NormalizationWorkout(string dataPath) public void Normalization() => NormalizationWorkout(GetDataPath("iris.data")); - private class IrisInput - { - // Unfortunately, we still need the dummy 'Label' column to be present. 
- [ColumnName("Label")] - public string IgnoredLabel { get; set; } - public float SepalLength { get; set; } - public float SepalWidth { get; set; } - public float PetalLength { get; set; } - public float PetalWidth { get; set; } - } - private IEnumerable GetChurnInfo() { var r = new Random(454); @@ -425,24 +382,11 @@ private void CrossValidationOn(string dataPath) var mlContext = new MLContext(); // Step one: read the data as an IDataView. - // First, we define the reader: specify the data columns and where to find them in the text file. - var reader = mlContext.Data.CreateTextReader(new[] - { - // We read the first 11 values as a single float vector. - new TextLoader.Column("SepalLength", DataKind.R4, 0), - new TextLoader.Column("SepalWidth", DataKind.R4, 1), - new TextLoader.Column("PetalLength", DataKind.R4, 2), - new TextLoader.Column("PetalWidth", DataKind.R4, 3), - // Label: kind of iris. - new TextLoader.Column("Label", DataKind.TX, 4), - }, + var data = mlContext.Data.ReadFromTextFile(dataPath, // Default separator is tab, but the dataset has comma. separatorChar: ',' ); - // Read the data. - var data = reader.Read(dataPath); - // Build the training pipeline. var dynamicPipeline = // Concatenate all the features together into one column 'Features'. @@ -486,18 +430,10 @@ private void ReadDataDynamic(string dataPath) // as a catalog of available operations and as the source of randomness. var mlContext = new MLContext(); - // Create the reader: define the data columns and where to find them in the text file. - var reader = mlContext.Data.CreateTextReader(new[] { - // We read the first 10 values as a single float vector. - new TextLoader.Column("FeatureVector", DataKind.R4, new[] {new TextLoader.Range(0, 9)}), - // Separately, read the target variable. - new TextLoader.Column("Target", DataKind.R4, 10) - }, + // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). 
+ var reader = mlContext.Data.ReadFromTextFile(dataPath, // Default separator is tab, but we need a comma. separatorChar: ',' ); - - // Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed). - var data = reader.Read(dataPath); } // Define a class for all the input columns that we intend to consume. @@ -616,11 +552,62 @@ private class IrisPrediction private class InspectedRow { + [LoadColumn(0)] public bool IsOver50K { get; set; } + + [LoadColumn(1)] public string Workclass { get; set; } + + [LoadColumn(2)] public string Education { get; set; } + + [LoadColumn(3)] public string MaritalStatus { get; set; } + + } + + private class InspectedRowWithAllFeatures : InspectedRow + { public string[] AllFeatures { get; set; } } + + private class IrisInput + { + // Unfortunately, we still need the dummy 'Label' column to be present. + [ColumnName("Label"), LoadColumn(4)] + public string IgnoredLabel { get; set; } + + [LoadColumn(0)] + public float SepalLength { get; set; } + + [LoadColumn(1)] + public float SepalWidth { get; set; } + + [LoadColumn(2)] + public float PetalLength { get; set; } + + [LoadColumn(3)] + public float PetalWidth { get; set; } + } + + private class IrisInputAllFeatures + { + // Unfortunately, we still need the dummy 'Label' column to be present. 
+ [ColumnName("Label"), LoadColumn(4)] + public string IgnoredLabel { get; set; } + + [LoadColumn(0, 3)] + public float Features { get; set; } + } + + private class AdultData + { + [LoadColumn(0, 10), ColumnName("FeatureVector")] + public float Features { get; set; } + + [LoadColumn(11)] + public float Target { get; set; } + } + } } diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs index b660abcf00..057c9c3bf5 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/CrossValidation.cs @@ -27,7 +27,8 @@ void New_CrossValidation() { var ml = new MLContext(seed: 1, conc: 1); - var data = ml.Data.CreateTextReader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true).Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); + // Pipeline. 
var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") .Append(ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features", advancedSettings: (s) => { s.ConvergenceTolerance = 1f; s.NumThreads = 1; })); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs index de30adfee9..3fe1bf3db9 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/DecomposableTrainAndPredict.cs @@ -29,8 +29,7 @@ void New_DecomposableTrainAndPredict() var dataPath = GetDataPath(TestDatasets.irisData.trainFilename); var ml = new MLContext(); - var data = ml.Data.CreateTextReader(TestDatasets.irisData.GetLoaderColumns(), separatorChar: ',') - .Read(dataPath); + var data = ml.Data.ReadFromTextFile(dataPath, separatorChar: ','); var pipeline = new ColumnConcatenatingEstimator (ml, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth") .Append(new ValueToKeyMappingEstimator(ml, "Label"), TransformerScope.TrainTest) diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs index 49c30579e7..d0c40aaee4 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/IntrospectiveTraining.cs @@ -34,8 +34,7 @@ public partial class ApiScenariosTests public void New_IntrospectiveTraining() { var ml = new MLContext(seed: 1, conc: 1); - var data = ml.Data.CreateTextReader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true) - .Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); var pipeline = 
ml.Transforms.Text.FeaturizeText("SentimentText", "Features") .AppendCacheCheckpoint(ml) diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs index 099a8f5484..410a26f6fa 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/Metacomponents.cs @@ -26,8 +26,7 @@ public partial class ApiScenariosTests public void New_Metacomponents() { var ml = new MLContext(); - var data = ml.Data.CreateTextReader(TestDatasets.irisData.GetLoaderColumns(), separatorChar: ',') - .Read(GetDataPath(TestDatasets.irisData.trainFilename)); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.irisData.trainFilename), separatorChar: ','); var sdcaTrainer = ml.BinaryClassification.Trainers.StochasticDualCoordinateAscent("Label", "Features", advancedSettings: (s) => { s.MaxIterations = 100; s.Shuffle = true; s.NumThreads = 1; }); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/MultithreadedPrediction.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/MultithreadedPrediction.cs index f449d0cbaa..bb7ae2bfbe 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/MultithreadedPrediction.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/MultithreadedPrediction.cs @@ -26,8 +26,7 @@ public partial class ApiScenariosTests void New_MultithreadedPrediction() { var ml = new MLContext(seed: 1, conc: 1); - var reader = ml.Data.CreateTextReader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true); - var data = reader.Read(new MultiFileSource(GetDataPath(TestDatasets.Sentiment.trainFilename))); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); // Pipeline. 
var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") @@ -41,7 +40,7 @@ void New_MultithreadedPrediction() var engine = model.CreatePredictionEngine(ml); // Take a couple examples out of the test data and run predictions on top. - var testData = reader.Read(new MultiFileSource(GetDataPath(TestDatasets.Sentiment.testFilename))) + var testData = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.testFilename), hasHeader: true) .AsEnumerable(ml, false); Parallel.ForEach(testData, (input) => diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs index 8d95868a4a..a3d0ef5223 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/ReconfigurablePrediction.cs @@ -22,10 +22,10 @@ public partial class ApiScenariosTests public void New_ReconfigurablePrediction() { var ml = new MLContext(seed: 1, conc: 1); - var dataReader = ml.Data.CreateTextReader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true); + var dataReader = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); - var data = dataReader.Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); - var testData = dataReader.Read(GetDataPath(TestDatasets.Sentiment.testFilename)); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); + var testData = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.testFilename), hasHeader: true); // Pipeline. 
var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/SimpleTrainAndPredict.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/SimpleTrainAndPredict.cs index d4c4480454..59e4aaefaf 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/SimpleTrainAndPredict.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/SimpleTrainAndPredict.cs @@ -22,8 +22,8 @@ public partial class ApiScenariosTests public void New_SimpleTrainAndPredict() { var ml = new MLContext(seed: 1, conc: 1); - var reader = ml.Data.CreateTextReader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true); - var data = reader.Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); + // Pipeline. var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") .AppendCacheCheckpoint(ml) @@ -36,7 +36,7 @@ public void New_SimpleTrainAndPredict() var engine = model.CreatePredictionEngine(ml); // Take a couple examples out of the test data and run predictions on top. 
- var testData = reader.Read(GetDataPath(TestDatasets.Sentiment.testFilename)) + var testData = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.testFilename), hasHeader: true) .AsEnumerable(ml, false); foreach (var input in testData.Take(5)) { diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainSaveModelAndPredict.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainSaveModelAndPredict.cs index a83ee4bd92..0d50865e4e 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainSaveModelAndPredict.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainSaveModelAndPredict.cs @@ -24,8 +24,7 @@ public partial class ApiScenariosTests public void New_TrainSaveModelAndPredict() { var ml = new MLContext(seed: 1, conc: 1); - var reader = ml.Data.CreateTextReader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true); - var data = reader.Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); // Pipeline. var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features") @@ -49,7 +48,7 @@ public void New_TrainSaveModelAndPredict() var engine = loadedModel.CreatePredictionEngine(ml); // Take a couple examples out of the test data and run predictions on top. 
- var testData = reader.Read(GetDataPath(TestDatasets.Sentiment.testFilename)) + var testData = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.testFilename), hasHeader: true) .AsEnumerable(ml, false); foreach (var input in testData.Take(5)) { diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithInitialPredictor.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithInitialPredictor.cs index a471bb7858..6b10338782 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithInitialPredictor.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithInitialPredictor.cs @@ -22,7 +22,7 @@ public void New_TrainWithInitialPredictor() var ml = new MLContext(seed: 1, conc: 1); - var data = ml.Data.CreateTextReader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true).Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); // Pipeline. var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features"); diff --git a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithValidationSet.cs b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithValidationSet.cs index 14593d0b85..b5550c2f73 100644 --- a/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithValidationSet.cs +++ b/test/Microsoft.ML.Tests/Scenarios/Api/Estimators/TrainWithValidationSet.cs @@ -20,14 +20,14 @@ public void New_TrainWithValidationSet() { var ml = new MLContext(seed: 1, conc: 1); // Pipeline. - var reader = ml.Data.CreateTextReader(TestDatasets.Sentiment.GetLoaderColumns(), hasHeader: true); + var data = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.trainFilename), hasHeader: true); var pipeline = ml.Transforms.Text.FeaturizeText("SentimentText", "Features"); // Train the pipeline, prepare train and validation set. 
- var data = reader.Read(GetDataPath(TestDatasets.Sentiment.trainFilename)); var preprocess = pipeline.Fit(data); var trainData = preprocess.Transform(data); - var validData = preprocess.Transform(reader.Read(GetDataPath(TestDatasets.Sentiment.testFilename))); + var validDataSource = ml.Data.ReadFromTextFile(GetDataPath(TestDatasets.Sentiment.testFilename), hasHeader: true); + var validData = preprocess.Transform(validDataSource); // Train model with validation set. var trainer = ml.BinaryClassification.Trainers.FastTree("Label","Features"); diff --git a/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs b/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs index 9e796428de..049ff574d1 100644 --- a/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/ClusteringTests.cs @@ -47,16 +47,16 @@ shall not be infringed."" public class NewsData { - [Column(ordinal: "0")] + [LoadColumn(0)] public string Id; - [Column(ordinal: "1", name: "Label")] + [LoadColumn(1) , ColumnName("Label")] public string Topic; - [Column(ordinal: "2")] + [LoadColumn(2)] public string Subject; - [Column(ordinal: "3")] + [LoadColumn(3)] public string Content; } diff --git a/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs b/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs index 180de2b766..ef8f704f4d 100644 --- a/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/HousePricePredictionTests.cs @@ -53,67 +53,67 @@ public async void PredictHousePriceModelTest() public class HousePriceData { - [Column(ordinal: "0")] + [LoadColumn(0)] public string Id; - [Column(ordinal: "1")] + [LoadColumn(1)] public string Date; - [Column(ordinal: "2", name: "Label")] + [LoadColumn(2), ColumnName("Label")] public float Price; - [Column(ordinal: "3")] + [LoadColumn(3)] public float Bedrooms; - [Column(ordinal: "4")] + [LoadColumn(4)] public float Bathrooms; - [Column(ordinal: "5")] + [LoadColumn(5)] 
public float SqftLiving; - [Column(ordinal: "6")] + [LoadColumn(6)] public float SqftLot; - [Column(ordinal: "7")] + [LoadColumn(7)] public float Floors; - [Column(ordinal: "8")] + [LoadColumn(8)] public float Waterfront; - [Column(ordinal: "9")] + [LoadColumn(9)] public float View; - [Column(ordinal: "10")] + [LoadColumn(10)] public float Condition; - [Column(ordinal: "11")] + [LoadColumn(11)] public float Grade; - [Column(ordinal: "12")] + [LoadColumn(12)] public float SqftAbove; - [Column(ordinal: "13")] + [LoadColumn(13)] public float SqftBasement; - [Column(ordinal: "14")] + [LoadColumn(14)] public float YearBuilt; - [Column(ordinal: "15")] + [LoadColumn(15)] public float YearRenovated; - [Column(ordinal: "16")] + [LoadColumn(16)] public float Zipcode; - [Column(ordinal: "17")] + [LoadColumn(17)] public float Lat; - [Column(ordinal: "18")] + [LoadColumn(18)] public float Long; - [Column(ordinal: "19")] + [LoadColumn(19)] public float SqftLiving15; - [Column(ordinal: "20")] + [LoadColumn(20)] public float SqftLot15; } diff --git a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs index ac76ae71f3..ab295ddca5 100644 --- a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationTests.cs @@ -117,19 +117,19 @@ public void TrainAndPredictIrisModelTest() public class IrisData { - [Column("0")] + [LoadColumn(0)] public float Label; - [Column("1")] + [LoadColumn(1)] public float SepalLength; - [Column("2")] + [LoadColumn(2)] public float SepalWidth; - [Column("3")] + [LoadColumn(3)] public float PetalLength; - [Column("4")] + [LoadColumn(4)] public float PetalWidth; } diff --git a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs index 5a5f0ebe7a..6ad0059032 100644 --- 
a/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/IrisPlantClassificationWithStringLabelTests.cs @@ -127,19 +127,19 @@ public void TrainAndPredictIrisModelWithStringLabelTest() public class IrisDataWithStringLabel { - [Column("0")] + [LoadColumn(0)] public float SepalLength; - [Column("1")] + [LoadColumn(1)] public float SepalWidth; - [Column("2")] + [LoadColumn(2)] public float PetalLength; - [Column("3")] + [LoadColumn(3)] public float PetalWidth; - [Column("4", name: "Label")] + [LoadColumn(4), ColumnName("Label")] public string IrisPlantType; } } diff --git a/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs b/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs index 93438995ef..488827881c 100644 --- a/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/PipelineApi/PipelineApiScenarioTests.cs @@ -21,22 +21,22 @@ public PipelineApiScenarioTests(ITestOutputHelper output) : base(output) public class IrisData : IrisDataNoLabel { - [Column("0")] + [LoadColumn(0)] public string Label; } public class IrisDataNoLabel { - [Column("1")] + [LoadColumn(1)] public float SepalLength; - [Column("2")] + [LoadColumn(2)] public float SepalWidth; - [Column("3")] + [LoadColumn(3)] public float PetalLength; - [Column("4")] + [LoadColumn(4)] public float PetalWidth; } @@ -47,9 +47,9 @@ public class IrisPrediction public class SentimentData { - [Column("0", name: "Label")] + [LoadColumn(0), ColumnName("Label")] public bool Sentiment; - [Column("1")] + [LoadColumn(1)] public string SentimentText; } diff --git a/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs b/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs index cb81c685fc..542a3dab97 100644 --- a/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/SentimentPredictionTests.cs @@ 
-504,9 +504,9 @@ private IEnumerable GetTestData() public class SentimentData { - [Column(ordinal: "0", name: "Label")] + [LoadColumn(0), ColumnName("Label")] public float Sentiment; - [Column(ordinal: "1")] + [LoadColumn(1)] public string SentimentText; } diff --git a/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs b/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs index 7c74b086b6..c53c68d496 100644 --- a/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/Scenarios/TensorflowTests.cs @@ -73,10 +73,10 @@ public void TensorFlowTransforCifarEndToEndTest() public class CifarData { - [Column("0")] + [LoadColumn(0)] public string ImagePath; - [Column("1")] + [LoadColumn(1)] public string Label; } @@ -88,10 +88,10 @@ public class CifarPrediction public class ImageNetData { - [Column("0")] + [LoadColumn(0)] public string ImagePath; - [Column("1")] + [LoadColumn(1)] public string Label; } diff --git a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs index 27ed4b982e..bfa32178d6 100644 --- a/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs +++ b/test/Microsoft.ML.Tests/ScenariosWithDirectInstantiation/TensorflowTests.cs @@ -585,7 +585,6 @@ public class MNISTData [Column("0")] public long Label; - [Column(ordinal: "1-784")] [VectorType(784)] public float[] Placeholder; } diff --git a/test/Microsoft.ML.Tests/TextLoaderTests.cs b/test/Microsoft.ML.Tests/TextLoaderTests.cs index bc3b1db7e2..d425bd8bdf 100644 --- a/test/Microsoft.ML.Tests/TextLoaderTests.cs +++ b/test/Microsoft.ML.Tests/TextLoaderTests.cs @@ -8,6 +8,7 @@ using Microsoft.ML.Runtime.RunTests; using Microsoft.ML.TestFramework; using System; +using System.Collections.Generic; using System.IO; using Xunit; using Xunit.Abstractions; @@ -336,7 +337,7 @@ public void CanSuccessfullyTrimSpaces() public void ThrowsExceptionWithPropertyName() { 
Exception ex = Assert.Throws(() => new Legacy.Data.TextLoader("fakefile.txt").CreateFrom()); - Assert.StartsWith("Field or property String1 is missing ColumnAttribute", ex.Message); + Assert.StartsWith($"Field or property String1 is missing {nameof(LoadColumnAttribute)}", ex.Message); } [Fact] @@ -350,46 +351,46 @@ public void CanSuccessfullyColumnNameProperty() public class QuoteInput { - [Column("0")] + [LoadColumn(0)] public float ID; - [Column("1")] + [LoadColumn(1)] public string Text; } public class SparseInput { - [Column("0")] + [LoadColumn(0)] public float C1; - [Column("1")] + [LoadColumn(1)] public float C2; - [Column("2")] + [LoadColumn(2)] public float C3; - [Column("3")] + [LoadColumn(3)] public float C4; - [Column("4")] + [LoadColumn(4)] public float C5; } public class Input { - [Column("0")] + [LoadColumn(0)] public string String1; - [Column("1")] + [LoadColumn(1)] public float Number1; } public class InputWithUnderscore { - [Column("0")] + [LoadColumn(0)] public string String_1; - [Column("1")] + [LoadColumn(1)] public float Number_1; } @@ -400,14 +401,125 @@ public class ModelWithoutColumnAttribute public class ModelWithColumnNameAttribute { - [Column("0", "Col1")] + [LoadColumn(0), ColumnName("Col1")] public string String_1; - [Column("1")] + + [LoadColumn(1)] [ColumnName("Col2")] public string String_2; - [Column("3")] + + [LoadColumn(3)] public string String_3; } } + + public class TextLoaderFromModelTests : BaseTestClass + { + public TextLoaderFromModelTests(ITestOutputHelper output) + : base(output) + { + + } + + public class Iris + { + [LoadColumn(0)] + public float SepalLength; + + [LoadColumn(1)] + public float SepalWidth; + + [LoadColumn(2)] + public float PetalLength; + + [LoadColumn(3)] + public float PetalWidth; + + [LoadColumn(4)] + public string Type; + } + + public class IrisStartEnd + { + [LoadColumn(start:0, end:3), ColumnName("Features")] + public float Features; + + [LoadColumn(4), ColumnName("Label")] + public string Type; + } 
+ + public class IrisColumnIndices + { + [LoadColumn(columnIndexes: new[] { 0, 2 })] + public float Features; + + [LoadColumn(4), ColumnName("Label")] + public string Type; + } + + [Fact] + public void LoaderColumnsFromIrisData() + { + var dataPath = GetDataPath(TestDatasets.irisData.trainFilename); + var ml = new MLContext(); + + var irisFirstRow = new Dictionary(); + irisFirstRow["SepalLength"] = 5.1f; + irisFirstRow["SepalWidth"] = 3.5f; + irisFirstRow["PetalLength"] = 1.4f; + irisFirstRow["PetalWidth"] = 0.2f; + + var irisFirstRowValues = irisFirstRow.Values.GetEnumerator(); + + // Simple load + var dataIris = ml.Data.CreateTextReader(separatorChar: ',').Read(dataPath); + var previewIris = dataIris.Preview(1); + + Assert.Equal(5, previewIris.ColumnView.Length); + Assert.Equal("SepalLength", previewIris.Schema[0].Name); + Assert.Equal(NumberType.R4, previewIris.Schema[0].Type); + int index = 0; + foreach (var entry in irisFirstRow) + { + Assert.Equal(entry.Key, previewIris.RowView[0].Values[index].Key); + Assert.Equal(entry.Value, previewIris.RowView[0].Values[index++].Value); + } + Assert.Equal("Type", previewIris.RowView[0].Values[index].Key); + Assert.Equal("Iris-setosa", previewIris.RowView[0].Values[index].Value.ToString()); + + // Load with start and end indexes + var dataIrisStartEnd = ml.Data.CreateTextReader(separatorChar: ',').Read(dataPath); + var previewIrisStartEnd = dataIrisStartEnd.Preview(1); + + Assert.Equal(2, previewIrisStartEnd.ColumnView.Length); + Assert.Equal("Features", previewIrisStartEnd.RowView[0].Values[0].Key); + var featureValue = (VBuffer)previewIrisStartEnd.RowView[0].Values[0].Value; + Assert.True(featureValue.IsDense); + Assert.Equal(4, featureValue.Length); + + irisFirstRowValues = irisFirstRow.Values.GetEnumerator(); + foreach (var val in featureValue.GetValues()) + { + irisFirstRowValues.MoveNext(); + Assert.Equal(irisFirstRowValues.Current, val); + } + + // load setting the distinct columns. 
Loading column 0 and 2 + var dataIrisColumnIndices = ml.Data.CreateTextReader(separatorChar: ',').Read(dataPath); + var previewIrisColumnIndices = dataIrisColumnIndices.Preview(1); + + Assert.Equal(2, previewIrisColumnIndices.ColumnView.Length); + featureValue = (VBuffer)previewIrisColumnIndices.RowView[0].Values[0].Value; + Assert.True(featureValue.IsDense); + Assert.Equal(2, featureValue.Length); + var vals4 = featureValue.GetValues(); + + irisFirstRowValues = irisFirstRow.Values.GetEnumerator(); + irisFirstRowValues.MoveNext(); + Assert.Equal(vals4[0], irisFirstRowValues.Current); + irisFirstRowValues.MoveNext(); irisFirstRowValues.MoveNext(); // skip col 1 + Assert.Equal(vals4[1], irisFirstRowValues.Current); + } + } #pragma warning restore 612, 618 } diff --git a/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs b/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs index 0fb1e167ac..2599df8bdd 100644 --- a/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs +++ b/test/Microsoft.ML.Tests/TrainerEstimators/MetalinearEstimators.cs @@ -70,7 +70,7 @@ public void Pkpd() } [Fact] - public void New_MetacomponentsFeaturesRenamed() + public void MetacomponentsFeaturesRenamed() { var data = new TextLoader(Env, TestDatasets.irisData.GetLoaderColumns(), separatorChar: ',') .Read(GetDataPath(TestDatasets.irisData.trainFilename));