-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Schema based text loader #1878
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Schema based text loader #1878
Changes from 4 commits
f699cb0
3931d13
7cacf58
c69eb4b
31e17d9
e1201bb
e583b88
55e1bdd
01fab7a
c540d7f
ce25b69
0392712
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -68,30 +68,110 @@ public VectorTypeAttribute(params int[] dims) | |
/// column encapsulates. | ||
/// </summary> | ||
[AttributeUsage(AttributeTargets.Field | AttributeTargets.Property, AllowMultiple = false, Inherited = true)] | ||
public sealed class ColumnAttribute : Attribute | ||
public sealed class LoadColumnAttribute : Attribute | ||
{ | ||
public ColumnAttribute(string ordinal, string name = null) | ||
|
||
public LoadColumnAttribute(int ordinal, string name = null, bool loadAllOthers = false) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
XML docs on all ctors and samples for all ctors #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
how about There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
can we remove There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
that's a misleading name. Maybe There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you think this will be useful when the user wants to load all features into one column, and the label in another. we'd spare them having to look up their cols cardinality, and a concat. In reply to: 242673339 [](ancestors = 242673339) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
{ | ||
Start = ordinal.ToString(); | ||
Sources = new List<TextLoader.Range>(); | ||
var range = new TextLoader.Range(ordinal); | ||
range.AllOther = loadAllOthers; | ||
Sources.Add(range); | ||
} | ||
|
||
public LoadColumnAttribute(string start, string end = null, string name = null, int[] columnIndexes = null) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
start and end should be integers here. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can't have nullable ints. we can't leave 'end' as Auto, if i flip to ints. that ok? In reply to: 242669241 [](ancestors = 242669241) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we want to preserve the '10-*' logic, I would go with a different attribute for it ( In reply to: 242741038 [](ancestors = 242741038,242669241) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think the '*' is going to be as useful in API as it is in cmdline. In reply to: 243120230 [](ancestors = 243120230,242741038,242669241) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I think this parameter is more misleading than it is helpful. Can you remove it? #Closed |
||
{ | ||
Name = name; | ||
Ordinal = ordinal; | ||
Start = start; | ||
End = end; | ||
ColumnIndexes = columnIndexes; | ||
|
||
Sources = new List<TextLoader.Range>(); | ||
|
||
bool hasEnd = int.TryParse(end, out int endIndex); | ||
var range = hasEnd ? new TextLoader.Range(int.Parse(start), endIndex) : new TextLoader.Range(int.Parse(start)); | ||
Sources.Add(range); | ||
|
||
if (columnIndexes != null) | ||
{ | ||
foreach (var col in columnIndexes) | ||
Sources.Add(new TextLoader.Range(col)); | ||
} | ||
} | ||
|
||
// REVIEW : AllOther seems to work only for a single column. Verify. | ||
public LoadColumnAttribute(string start, string end, string name = null, bool loadInverseRange = false) | ||
{ | ||
Name = name; | ||
LoadInverseRange = loadInverseRange; | ||
Start = start; | ||
End = end; | ||
|
||
Sources = new List<TextLoader.Range>(); | ||
var range = new TextLoader.Range(int.Parse(start), int.Parse(end)); | ||
range.AllOther = loadInverseRange; | ||
Sources.Add(range); | ||
} | ||
|
||
public LoadColumnAttribute(int[] columnIndexes, string name = null) | ||
{ | ||
Name = name; | ||
ColumnIndexes = columnIndexes; | ||
|
||
Sources = new List<TextLoader.Range>(); | ||
foreach (var col in columnIndexes) | ||
Sources.Add(new TextLoader.Range(col)); | ||
} | ||
|
||
internal List<TextLoader.Range> Sources; | ||
|
||
/// <summary> | ||
/// Column name. | ||
/// </summary> | ||
public string Name { get; } | ||
|
||
/// <summary> | ||
/// Contains positions of indices of source columns in the form | ||
/// of ranges. Examples of range: if we want to include just column | ||
/// with index 1 we can write the range as 1, if we want to include | ||
/// columns 1 to 10 then we can write the range as 1-10, and if we want to include all the | ||
/// columns from column with index 1 until end then we can write 1-*. | ||
/// | ||
/// This takes a sequence of ranges that are comma-separated, for example: | ||
/// 1,2-5,10-* | ||
/// The optional start index for loading a contiguous range of columns, or the single index in the case | ||
/// of loading a single column. | ||
/// Either this parameter or <see cref="ColumnIndexes"/> should be specified. | ||
/// </summary> | ||
public string Start { get; } | ||
|
||
/// <summary> | ||
/// Optional field, used to set the dataset columns range end index when loading a range of columns. | ||
/// </summary> | ||
public string Ordinal { get; } | ||
public string End { get; } | ||
|
||
/// <summary> | ||
/// Optional field used to specify the distinct indices of the dataset columns that need to be loaded, and mapped to this | ||
/// <see cref="TextLoader.Column"/>. | ||
/// </summary> | ||
public int[] ColumnIndexes { get; } | ||
|
||
/// <summary> | ||
/// If this is set to true, the columns defined in the range through either the <see cref="Start"/>, <see cref="End"/> or the | ||
/// <see cref="ColumnIndexes"/> will be excluded from loading, and all the other ones will be loaded and mapped to the <see cref="TextLoader.Column"/>. | ||
/// </summary> | ||
public bool LoadInverseRange { get; } | ||
} | ||
|
||
/// <summary> | ||
/// Describes column information such as name and the source columns indices that this | ||
/// column encapsulates. | ||
/// </summary> | ||
[AttributeUsage(AttributeTargets.Field | AttributeTargets.Property, AllowMultiple = false, Inherited = true)] | ||
public sealed class ColumnAttribute : Attribute | ||
{ | ||
public ColumnAttribute(string ordinal, string name = null) | ||
{ | ||
Name = name; | ||
} | ||
|
||
/// <summary> | ||
/// Column name. | ||
/// </summary> | ||
public string Name { get; } | ||
} | ||
|
||
/// <summary> | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -343,28 +343,28 @@ public class ArgumentsCore | |
" missing value and an empty value is denoted by \"\". When false, consecutive separators" + | ||
" denote an empty value.", | ||
ShortName = "quote")] | ||
public bool AllowQuoting = true; | ||
public bool AllowQuoting = DefaultArguments.AllowQuoting; | ||
|
||
[Argument(ArgumentType.AtMostOnce, HelpText = "Whether the input may include sparse representations", ShortName = "sparse")] | ||
public bool AllowSparse = true; | ||
public bool AllowSparse = DefaultArguments.AllowSparse; | ||
|
||
[Argument(ArgumentType.AtMostOnce, | ||
HelpText = "Number of source columns in the text data. Default is that sparse rows contain their size information.", | ||
ShortName = "size")] | ||
public int? InputSize; | ||
|
||
[Argument(ArgumentType.AtMostOnce, Visibility = ArgumentAttribute.VisibilityType.CmdLineOnly, HelpText = "Source column separator. Options: tab, space, comma, single character", ShortName = "sep")] | ||
public string Separator = "tab"; | ||
public string Separator = "tab"; //DefaultArguments.Separator | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Is this comment needed? #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. or maybe just two default arguments Separator and SeparatorChars? Two different params don't guarantee things to be in sync. I'm assuming since these are arguments that they can be set by the user -- so what happens if a user specifies Separator to be space, but doesn't set the separator chars? In reply to: 243411408 [](ancestors = 243411408) |
||
|
||
[Argument(ArgumentType.AtMostOnce, Name = nameof(Separator), Visibility = ArgumentAttribute.VisibilityType.EntryPointsOnly, HelpText = "Source column separator.", ShortName = "sep")] | ||
public char[] SeparatorChars = new[] { '\t' }; | ||
public char[] SeparatorChars = new[] { DefaultArguments.Separator }; | ||
|
||
[Argument(ArgumentType.Multiple, HelpText = "Column groups. Each group is specified as name:type:numeric-ranges, eg, col=Features:R4:1-17,26,35-40", | ||
ShortName = "col", SortOrder = 1)] | ||
public Column[] Column; | ||
|
||
[Argument(ArgumentType.AtMostOnce, HelpText = "Remove trailing whitespace from lines", ShortName = "trim")] | ||
public bool TrimWhitespace; | ||
public bool TrimWhitespace = DefaultArguments.TrimWhitespace; | ||
|
||
[Argument(ArgumentType.AtMostOnce, ShortName = "header", | ||
HelpText = "Data file has header with feature names. Header is read only if options 'hs' and 'hf' are not specified.")] | ||
|
@@ -392,6 +392,15 @@ public sealed class Arguments : ArgumentsCore | |
public long? MaxRows; | ||
} | ||
|
||
internal static class DefaultArguments | ||
{ | ||
internal const bool AllowQuoting = true; | ||
internal const bool AllowSparse = true; | ||
internal const char Separator = '\t'; | ||
internal const bool HasHeader = false; | ||
internal const bool TrimWhitespace = false; | ||
} | ||
|
||
/// <summary> | ||
/// Used as an input column range. | ||
/// A variable length segment (extending to the end of the input line) is represented by Lim == SrcLim. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -2,12 +2,16 @@ | |
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using Microsoft.ML.Data; | ||
using Microsoft.ML.Runtime; | ||
using Microsoft.ML.Runtime.Data; | ||
using Microsoft.ML.Runtime.Data.IO; | ||
using System; | ||
using System.Collections.Generic; | ||
using System.IO; | ||
using static Microsoft.ML.Runtime.Data.TextLoader; | ||
using System.Linq; | ||
using System.Reflection; | ||
using System.Text.RegularExpressions; | ||
|
||
namespace Microsoft.ML | ||
{ | ||
|
@@ -22,7 +26,7 @@ public static class TextLoaderSaverCatalog | |
/// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param> | ||
/// <param name="dataSample">The optional location of a data sample.</param> | ||
public static TextLoader CreateTextReader(this DataOperations catalog, | ||
Column[] columns, bool hasHeader = false, char separatorChar = '\t', IMultiStreamSource dataSample = null) | ||
TextLoader.Column[] columns, bool hasHeader = false, char separatorChar = '\t', IMultiStreamSource dataSample = null) | ||
=> new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, hasHeader, separatorChar, dataSample); | ||
|
||
/// <summary> | ||
|
@@ -31,9 +35,86 @@ public static TextLoader CreateTextReader(this DataOperations catalog, | |
/// <param name="catalog">The catalog.</param> | ||
/// <param name="args">Defines the settings of the load operation.</param> | ||
/// <param name="dataSample">Allows to expose items that can be used for reading.</param> | ||
public static TextLoader CreateTextReader(this DataOperations catalog, Arguments args, IMultiStreamSource dataSample = null) | ||
public static TextLoader CreateTextReader(this DataOperations catalog, TextLoader.Arguments args, IMultiStreamSource dataSample = null) | ||
=> new TextLoader(CatalogUtils.GetEnvironment(catalog), args, dataSample); | ||
|
||
/// <summary> | ||
/// Create a text reader <see cref="TextLoader"/>. | ||
/// </summary> | ||
/// <param name="catalog">The catalog.</param> | ||
/// <param name="hasHeader"></param> | ||
/// <param name="separator"></param> | ||
/// <param name="allowQuotedStrings"></param> | ||
/// <param name="supportSparse"></param> | ||
/// <param name="trimWhitespace"></param> | ||
public static TextLoader CreateTextReader<TInput>(this DataOperations catalog, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This entire code should move out of the catalog and into a static method of |
||
bool hasHeader = TextLoader.DefaultArguments.HasHeader, | ||
char separator = TextLoader.DefaultArguments.Separator, | ||
bool allowQuotedStrings = TextLoader.DefaultArguments.AllowQuoting, | ||
bool supportSparse = TextLoader.DefaultArguments.AllowSparse, | ||
bool trimWhitespace = TextLoader.DefaultArguments.TrimWhitespace) | ||
{ | ||
var userType = typeof(TInput); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you describe what this function is doing? I'd like to see something |
||
|
||
var fieldInfos = userType.GetFields(BindingFlags.Public | BindingFlags.Instance); | ||
|
||
var propertyInfos = | ||
userType | ||
.GetProperties(BindingFlags.Public | BindingFlags.Instance) | ||
.Where(x => x.CanRead && x.CanWrite && x.GetGetMethod() != null && x.GetSetMethod() != null && x.GetIndexParameters().Length == 0); | ||
|
||
var memberInfos = (fieldInfos as IEnumerable<MemberInfo>).Concat(propertyInfos).ToArray(); | ||
|
||
var columns = new TextLoader.Column[memberInfos.Length]; | ||
|
||
for (int index = 0; index < memberInfos.Length; index++) | ||
{ | ||
var memberInfo = memberInfos[index]; | ||
var mappingAttr = memberInfo.GetCustomAttribute<LoadColumnAttribute>(); | ||
var mptr = memberInfo.GetCustomAttributes(); | ||
|
||
Contracts.Assert(mappingAttr != null, $"Field or property {memberInfo.Name} is missing the LoadColumn attribute"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
maybe There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Resolving the questions, since the discussion has moved to the other comment. In reply to: 243040913 [](ancestors = 243040913,242670646) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
it should be |
||
|
||
var column = new TextLoader.Column(); | ||
column.Name = mappingAttr.Name ?? memberInfo.Name; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
check for |
||
column.Source = mappingAttr.Sources.ToArray(); | ||
DataKind dk; | ||
switch (memberInfo) | ||
{ | ||
case FieldInfo field: | ||
if (!DataKindExtensions.TryGetDataKind(field.FieldType.IsArray ? field.FieldType.GetElementType() : field.FieldType, out dk)) | ||
throw Contracts.Except($"Field {memberInfo.Name} is of unsupported type."); | ||
|
||
break; | ||
|
||
case PropertyInfo property: | ||
if (!DataKindExtensions.TryGetDataKind(property.PropertyType.IsArray ? property.PropertyType.GetElementType() : property.PropertyType, out dk)) | ||
throw Contracts.Except($"Property {memberInfo.Name} is of unsupported type."); | ||
break; | ||
|
||
default: | ||
Contracts.Assert(false); | ||
throw Contracts.ExceptNotSupp("Expected a FieldInfo or a PropertyInfo"); | ||
} | ||
|
||
column.Type = dk; | ||
|
||
columns[index] = column; | ||
} | ||
|
||
TextLoader.Arguments args = new TextLoader.Arguments | ||
{ | ||
HasHeader = hasHeader, | ||
SeparatorChars = new[] { separator }, | ||
AllowQuoting = allowQuotedStrings, | ||
AllowSparse = supportSparse, | ||
TrimWhitespace = trimWhitespace, | ||
Column = columns | ||
}; | ||
|
||
return new TextLoader(CatalogUtils.GetEnvironment(catalog), args); | ||
} | ||
|
||
/// <summary> | ||
/// Read a data view from a text file using <see cref="TextLoader"/>. | ||
/// </summary> | ||
|
@@ -44,7 +125,7 @@ public static TextLoader CreateTextReader(this DataOperations catalog, Arguments | |
/// <param name="path">The path to the file.</param> | ||
/// <returns>The data view.</returns> | ||
public static IDataView ReadFromTextFile(this DataOperations catalog, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
we should also add a templated version of |
||
string path, Column[] columns, bool hasHeader = false, char separatorChar = '\t') | ||
string path, TextLoader.Column[] columns, bool hasHeader = false, char separatorChar = '\t') | ||
{ | ||
Contracts.CheckNonEmpty(path, nameof(path)); | ||
|
||
|
@@ -62,7 +143,7 @@ public static IDataView ReadFromTextFile(this DataOperations catalog, | |
/// <param name="catalog">The catalog.</param> | ||
/// <param name="path">Specifies a file from which to read.</param> | ||
/// <param name="args">Defines the settings of the load operation.</param> | ||
public static IDataView ReadFromTextFile(this DataOperations catalog, string path, Arguments args = null) | ||
public static IDataView ReadFromTextFile(this DataOperations catalog, string path, TextLoader.Arguments args = null) | ||
{ | ||
Contracts.CheckNonEmpty(path, nameof(path)); | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This attribute should go next to
TextLoader
, not as part of this class. #Closed