-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Make text loaders consistent #2710
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,17 +18,31 @@ public static class TextLoaderSaverCatalog | |
/// <param name="columns">Array of columns <see cref="TextLoader.Column"/> defining the schema.</param> | ||
/// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param> | ||
/// <param name="hasHeader">Whether the file has a header.</param> | ||
/// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param> | ||
/// <param name="allowQuoting">Whether the file can contain column defined by a quoted string.</param> | ||
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param> | ||
/// <param name="allowQuoting">Whether the file can contain column defined by a quoted string.</param> | ||
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param> | ||
/// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param> | ||
public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, | ||
TextLoader.Column[] columns, | ||
char separatorChar = TextLoader.Defaults.Separator, | ||
bool hasHeader = TextLoader.Defaults.HasHeader, | ||
bool allowSparse = TextLoader.Defaults.AllowSparse, | ||
IMultiStreamSource dataSample = null, | ||
bool allowQuoting = TextLoader.Defaults.AllowQuoting, | ||
IMultiStreamSource dataSample = null) | ||
=> new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, separatorChar, hasHeader, allowSparse, allowQuoting, dataSample); | ||
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, | ||
bool allowSparse = TextLoader.Defaults.AllowSparse) | ||
{ | ||
var options = new TextLoader.Options | ||
{ | ||
Columns = columns, | ||
Separators = new[] { separatorChar }, | ||
HasHeader = hasHeader, | ||
AllowQuoting = allowQuoting, | ||
TrimWhitespace = trimWhitespace, | ||
AllowSparse = allowSparse | ||
}; | ||
|
||
return new TextLoader(CatalogUtils.GetEnvironment(catalog), options: options, dataSample: dataSample); | ||
} | ||
|
||
/// <summary> | ||
/// Create a text loader <see cref="TextLoader"/>. | ||
|
@@ -47,79 +61,98 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog, | |
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param> | ||
/// <param name="separatorChar">Column separator character. Default is '\t'</param> | ||
/// <param name="hasHeader">Does the file contains header?</param> | ||
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param> | ||
/// <param name="allowQuoting">Whether the input may include quoted values, | ||
/// which can contain separator characters, colons, | ||
/// and distinguish empty values from missing values. When true, consecutive separators | ||
/// denote a missing value and an empty value is denoted by \"\". | ||
/// When false, consecutive separators denote an empty value.</param> | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. If you could make the comments consistent as well, it would be great. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param> | ||
/// <param name="allowSparse">Whether the input may include sparse representations for example, | ||
/// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero | ||
/// except for 3rd and 5th columns which have values 6 and 3</param> | ||
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param> | ||
public static TextLoader CreateTextLoader<TInput>(this DataOperationsCatalog catalog, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
This is not consistent. The file is missing altogether, which means it cannot be used to read slot names. Whoops. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ok. I wrongly thought TInput could be used to encode SlotNames. In reply to: 259963380 [](ancestors = 259963380) |
||
char separatorChar = TextLoader.Defaults.Separator, | ||
bool hasHeader = TextLoader.Defaults.HasHeader, | ||
IMultiStreamSource dataSample = null, | ||
bool allowQuoting = TextLoader.Defaults.AllowQuoting, | ||
bool allowSparse = TextLoader.Defaults.AllowSparse, | ||
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace) | ||
=> TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace); | ||
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, | ||
bool allowSparse = TextLoader.Defaults.AllowSparse) | ||
=> TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, | ||
allowSparse, trimWhitespace, dataSample: dataSample); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Why pass the arguments this way? Why not add dataSample after hasHeader? |
||
|
||
/// <summary> | ||
/// Read a data view from a text file using <see cref="TextLoader"/>. | ||
/// </summary> | ||
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param> | ||
/// <param name="path">The path to the file.</param> | ||
/// <param name="columns">The columns of the schema.</param> | ||
/// <param name="hasHeader">Whether the file has a header.</param> | ||
/// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param> | ||
/// <param name="path">The path to the file.</param> | ||
/// <param name="hasHeader">Whether the file has a header.</param> | ||
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param> | ||
/// <param name="allowQuoting">Whether the file can contain column defined by a quoted string.</param> | ||
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param> | ||
/// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param> | ||
/// <returns>The data view.</returns> | ||
public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, | ||
string path, | ||
TextLoader.Column[] columns, | ||
char separatorChar = TextLoader.Defaults.Separator, | ||
bool hasHeader = TextLoader.Defaults.HasHeader) | ||
bool hasHeader = TextLoader.Defaults.HasHeader, | ||
IMultiStreamSource dataSample = null, | ||
bool allowQuoting = TextLoader.Defaults.AllowQuoting, | ||
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, | ||
bool allowSparse = TextLoader.Defaults.AllowSparse) | ||
{ | ||
Contracts.CheckNonEmpty(path, nameof(path)); | ||
|
||
var env = catalog.GetEnvironment(); | ||
var options = new TextLoader.Options | ||
{ | ||
Columns = columns, | ||
Separators = new[] { separatorChar }, | ||
HasHeader = hasHeader, | ||
AllowQuoting = allowQuoting, | ||
TrimWhitespace = trimWhitespace, | ||
AllowSparse = allowSparse | ||
}; | ||
|
||
// REVIEW: it is almost always a mistake to have a 'trainable' text loader here. | ||
// Therefore, we are going to disallow data sample. | ||
var reader = new TextLoader(env, columns, separatorChar, hasHeader, dataSample: null); | ||
var reader = new TextLoader(CatalogUtils.GetEnvironment(catalog), options: options, dataSample: dataSample); | ||
return reader.Read(new MultiFileSource(path)); | ||
} | ||
|
||
/// <summary> | ||
/// Read a data view from a text file using <see cref="TextLoader"/>. | ||
/// </summary> | ||
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param> | ||
/// <param name="hasHeader">Does the file contains header?</param> | ||
/// <param name="path">The path to the file.</param> | ||
/// <param name="separatorChar">Column separator character. Default is '\t'</param> | ||
/// <param name="hasHeader">Does the file contains header?</param> | ||
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param> | ||
/// <param name="allowQuoting">Whether the input may include quoted values, | ||
/// which can contain separator characters, colons, | ||
/// and distinguish empty values from missing values. When true, consecutive separators | ||
/// denote a missing value and an empty value is denoted by \"\". | ||
/// When false, consecutive separators denote an empty value.</param> | ||
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param> | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Minor nit: add a period. |
||
/// <param name="allowSparse">Whether the input may include sparse representations for example, | ||
/// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero | ||
/// except for 3rd and 5th columns which have values 6 and 3</param> | ||
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param> | ||
/// <param name="path">The path to the file.</param> | ||
/// <returns>The data view.</returns> | ||
public static IDataView ReadFromTextFile<TInput>(this DataOperationsCatalog catalog, | ||
string path, | ||
char separatorChar = TextLoader.Defaults.Separator, | ||
bool hasHeader = TextLoader.Defaults.HasHeader, | ||
IMultiStreamSource dataSample = null, | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Similar here. |
||
bool allowQuoting = TextLoader.Defaults.AllowQuoting, | ||
bool allowSparse = TextLoader.Defaults.AllowSparse, | ||
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace) | ||
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace, | ||
bool allowSparse = TextLoader.Defaults.AllowSparse) | ||
{ | ||
Contracts.CheckNonEmpty(path, nameof(path)); | ||
|
||
// REVIEW: it is almost always a mistake to have a 'trainable' text loader here. | ||
// Therefore, we are going to disallow data sample. | ||
return TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace) | ||
.Read(new MultiFileSource(path)); | ||
return TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, | ||
allowQuoting, allowSparse, trimWhitespace, dataSample: dataSample).Read(new MultiFileSource(path)); | ||
} | ||
|
||
/// <summary> | ||
|
@@ -128,14 +161,19 @@ public static IDataView ReadFromTextFile<TInput>(this DataOperationsCatalog cata | |
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param> | ||
/// <param name="path">Specifies a file from which to read.</param> | ||
/// <param name="options">Defines the settings of the load operation.</param> | ||
public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, string path, TextLoader.Options options = null) | ||
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param> | ||
public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, string path, | ||
TextLoader.Options options = null, IMultiStreamSource dataSample = null) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Hi @wschin, somehow I missed this -- definitely we don't need to have a data sample if we're talking about the path to a data file already. |
||
{ | ||
Contracts.CheckNonEmpty(path, nameof(path)); | ||
|
||
var env = catalog.GetEnvironment(); | ||
var source = new MultiFileSource(path); | ||
|
||
return new TextLoader(env, options, source).Read(source); | ||
if (dataSample == null) | ||
return new TextLoader(env, options, source).Read(source); | ||
else | ||
return new TextLoader(env, options, dataSample).Read(source); | ||
} | ||
|
||
/// <summary> | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nitpick: I would rephrase this as "Set to true if the file contains a header row."