Skip to content

Make text loaders consistent #2710

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 5 additions & 28 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1085,30 +1085,6 @@ private bool HasHeader
private readonly IHost _host;
private const string RegistrationName = "TextLoader";

/// <summary>
/// Loads a text file into an <see cref="IDataView"/>. Supports basic mapping from input columns to IDataView columns.
/// </summary>
/// <param name="env">The environment to use.</param>
/// <param name="columns">Defines a mapping between input columns in the file and IDataView columns.</param>
/// <param name="separatorChar"> The character used as separator between data points in a row. By default the tab character is used as separator.</param>
/// <param name="hasHeader">Whether the file has a header.</param>
/// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param>
/// <param name="allowQuoting">Whether the content of a column can be parsed from a string starting and ending with quote.</param>
/// <param name="dataSample">Allows to expose items that can be used for reading.</param>
internal TextLoader(IHostEnvironment env, Column[] columns, char separatorChar = Defaults.Separator,
bool hasHeader = Defaults.HasHeader, bool allowSparse = Defaults.AllowSparse,
bool allowQuoting = Defaults.AllowQuoting, IMultiStreamSource dataSample = null)
: this(env, MakeArgs(columns, hasHeader, new[] { separatorChar }, allowSparse, allowQuoting), dataSample)
{
}

private static Options MakeArgs(Column[] columns, bool hasHeader, char[] separatorChars, bool allowSparse, bool allowQuoting)
{
Contracts.AssertValue(separatorChars);
var result = new Options { Columns = columns, HasHeader = hasHeader, Separators = separatorChars, AllowSparse = allowSparse, AllowQuoting = allowQuoting };
return result;
}

/// <summary>
/// Loads a text file into an <see cref="IDataView"/>. Supports basic mapping from input columns to IDataView columns.
/// </summary>
Expand Down Expand Up @@ -1462,9 +1438,10 @@ void ICanSaveModel.Save(ModelSaveContext ctx)
internal static TextLoader CreateTextReader<TInput>(IHostEnvironment host,
bool hasHeader = Defaults.HasHeader,
char separator = Defaults.Separator,
bool allowQuotedStrings = Defaults.AllowQuoting,
bool allowQuoting = Defaults.AllowQuoting,
bool supportSparse = Defaults.AllowSparse,
bool trimWhitespace = Defaults.TrimWhitespace)
bool trimWhitespace = Defaults.TrimWhitespace,
IMultiStreamSource dataSample = null)
{
var userType = typeof(TInput);

Expand Down Expand Up @@ -1519,13 +1496,13 @@ internal static TextLoader CreateTextReader<TInput>(IHostEnvironment host,
{
HasHeader = hasHeader,
Separators = new[] { separator },
AllowQuoting = allowQuotedStrings,
AllowQuoting = allowQuoting,
AllowSparse = supportSparse,
TrimWhitespace = trimWhitespace,
Columns = columns.ToArray()
};

return new TextLoader(host, options);
return new TextLoader(host, options, dataSample: dataSample);
}

private sealed class BoundLoader : IDataLoader
Expand Down
88 changes: 63 additions & 25 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,17 +18,31 @@ public static class TextLoaderSaverCatalog
/// <param name="columns">Array of columns <see cref="TextLoader.Column"/> defining the schema.</param>
/// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param>
/// <param name="hasHeader">Whether the file has a header.</param>
/// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param>
/// <param name="allowQuoting">Whether the file can contain column defined by a quoted string.</param>
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param>
/// <param name="allowQuoting">Whether the file can contain column defined by a quoted string.</param>
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param>
/// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param>
public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
TextLoader.Column[] columns,
char separatorChar = TextLoader.Defaults.Separator,
bool hasHeader = TextLoader.Defaults.HasHeader,
bool allowSparse = TextLoader.Defaults.AllowSparse,
IMultiStreamSource dataSample = null,
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
IMultiStreamSource dataSample = null)
=> new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, separatorChar, hasHeader, allowSparse, allowQuoting, dataSample);
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
bool allowSparse = TextLoader.Defaults.AllowSparse)
{
var options = new TextLoader.Options
{
Columns = columns,
Separators = new[] { separatorChar },
HasHeader = hasHeader,
AllowQuoting = allowQuoting,
TrimWhitespace = trimWhitespace,
AllowSparse = allowSparse
};

return new TextLoader(CatalogUtils.GetEnvironment(catalog), options: options, dataSample: dataSample);
}

/// <summary>
/// Create a text loader <see cref="TextLoader"/>.
Expand All @@ -47,79 +61,98 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
/// <param name="separatorChar">Column separator character. Default is '\t'</param>
/// <param name="hasHeader">Does the file contains header?</param>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

hasHeader [](start = 25, length = 9)

Nit pick - I would rephrase this as "Set to true if the file contains a header row".

/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param>
/// <param name="allowQuoting">Whether the input may include quoted values,
/// which can contain separator characters, colons,
/// and distinguish empty values from missing values. When true, consecutive separators
/// denote a missing value and an empty value is denoted by \"\".
/// When false, consecutive separators denote an empty value.</param>
Copy link
Contributor

@artidoro artidoro Feb 25, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you could make the comments consistent as well it would be great.
#Resolved

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure.


In reply to: 259975940 [](ancestors = 259975940)

/// <param name="trimWhitespace">Remove trailing whitespace from lines</param>
/// <param name="allowSparse">Whether the input may include sparse representations for example,
/// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero
/// except for 3rd and 5th columns which have values 6 and 3</param>
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param>
public static TextLoader CreateTextLoader<TInput>(this DataOperationsCatalog catalog,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CreateTextLoader [](start = 33, length = 16)

This is not consistent. The file is missing altogether, which means it cannot be used to read slot names. Whoops.

Copy link
Member Author

@wschin wschin Feb 25, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok. I wrongly thought TInput can be used to encode SlotNames. IMultiStreamSource dataSample = null is just added.


In reply to: 259963380 [](ancestors = 259963380)

char separatorChar = TextLoader.Defaults.Separator,
bool hasHeader = TextLoader.Defaults.HasHeader,
IMultiStreamSource dataSample = null,
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
bool allowSparse = TextLoader.Defaults.AllowSparse,
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace)
=> TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace);
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
bool allowSparse = TextLoader.Defaults.AllowSparse)
=> TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting,
allowSparse, trimWhitespace, dataSample: dataSample);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

dataSample [](start = 45, length = 10)

why pass the arguments this way? Why not add dataSample after hasHeader?


/// <summary>
/// Read a data view from a text file using <see cref="TextLoader"/>.
/// </summary>
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
/// <param name="path">The path to the file.</param>
/// <param name="columns">The columns of the schema.</param>
/// <param name="hasHeader">Whether the file has a header.</param>
/// <param name="separatorChar">The character used as separator between data points in a row. By default the tab character is used as separator.</param>
/// <param name="path">The path to the file.</param>
/// <param name="hasHeader">Whether the file has a header.</param>
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param>
/// <param name="allowQuoting">Whether the file can contain column defined by a quoted string.</param>
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param>
/// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param>
/// <returns>The data view.</returns>
public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog,
string path,
TextLoader.Column[] columns,
char separatorChar = TextLoader.Defaults.Separator,
bool hasHeader = TextLoader.Defaults.HasHeader)
bool hasHeader = TextLoader.Defaults.HasHeader,
IMultiStreamSource dataSample = null,
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
bool allowSparse = TextLoader.Defaults.AllowSparse)
{
Contracts.CheckNonEmpty(path, nameof(path));

var env = catalog.GetEnvironment();
var options = new TextLoader.Options
{
Columns = columns,
Separators = new[] { separatorChar },
HasHeader = hasHeader,
AllowQuoting = allowQuoting,
TrimWhitespace = trimWhitespace,
AllowSparse = allowSparse
};

// REVIEW: it is almost always a mistake to have a 'trainable' text loader here.
// Therefore, we are going to disallow data sample.
var reader = new TextLoader(env, columns, separatorChar, hasHeader, dataSample: null);
var reader = new TextLoader(CatalogUtils.GetEnvironment(catalog), options: options, dataSample: dataSample);
return reader.Read(new MultiFileSource(path));
}

/// <summary>
/// Read a data view from a text file using <see cref="TextLoader"/>.
/// </summary>
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
/// <param name="hasHeader">Does the file contains header?</param>
/// <param name="path">The path to the file.</param>
/// <param name="separatorChar">Column separator character. Default is '\t'</param>
/// <param name="hasHeader">Does the file contains header?</param>
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param>
/// <param name="allowQuoting">Whether the input may include quoted values,
/// which can contain separator characters, colons,
/// and distinguish empty values from missing values. When true, consecutive separators
/// denote a missing value and an empty value is denoted by \"\".
/// When false, consecutive separators denote an empty value.</param>
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor nit - Add a period.

/// <param name="allowSparse">Whether the input may include sparse representations for example,
/// if one of the row contains "5 2:6 4:3" that's mean there are 5 columns all zero
/// except for 3rd and 5th columns which have values 6 and 3</param>
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param>
/// <param name="path">The path to the file.</param>
/// <returns>The data view.</returns>
public static IDataView ReadFromTextFile<TInput>(this DataOperationsCatalog catalog,
string path,
char separatorChar = TextLoader.Defaults.Separator,
bool hasHeader = TextLoader.Defaults.HasHeader,
IMultiStreamSource dataSample = null,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IMultiStreamSource dataSample [](start = 12, length = 29)

Similar here.

bool allowQuoting = TextLoader.Defaults.AllowQuoting,
bool allowSparse = TextLoader.Defaults.AllowSparse,
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace)
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace,
bool allowSparse = TextLoader.Defaults.AllowSparse)
{
Contracts.CheckNonEmpty(path, nameof(path));

// REVIEW: it is almost always a mistake to have a 'trainable' text loader here.
// Therefore, we are going to disallow data sample.
return TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace)
.Read(new MultiFileSource(path));
return TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar,
allowQuoting, allowSparse, trimWhitespace, dataSample: dataSample).Read(new MultiFileSource(path));
}

/// <summary>
Expand All @@ -128,14 +161,19 @@ public static IDataView ReadFromTextFile<TInput>(this DataOperationsCatalog cata
/// <param name="catalog">The <see cref="DataOperationsCatalog"/> catalog.</param>
/// <param name="path">Specifies a file from which to read.</param>
/// <param name="options">Defines the settings of the load operation.</param>
public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, string path, TextLoader.Options options = null)
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param>
public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog, string path,
TextLoader.Options options = null, IMultiStreamSource dataSample = null)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

, IMultiStreamSource dataSample = null [](start = 45, length = 38)

Hi @wschin, somehow I missed this -- definitely we don't need to have a data sample if we're talking about the path to a data file already.

{
Contracts.CheckNonEmpty(path, nameof(path));

var env = catalog.GetEnvironment();
var source = new MultiFileSource(path);

return new TextLoader(env, options, source).Read(source);
if (dataSample == null)
return new TextLoader(env, options, source).Read(source);
else
return new TextLoader(env, options, dataSample).Read(source);
}

/// <summary>
Expand Down
17 changes: 13 additions & 4 deletions src/Microsoft.ML.Data/Transforms/ValueToKeyMappingTransformer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -439,10 +439,19 @@ internal static IDataView GetKeyDataViewOrNull(IHostEnvironment env, IChannel ch
"{0} should not be specified when default loader is " + nameof(TextLoader) + ". Ignoring {0}={1}",
nameof(Options.TermsColumn), src);
}
keyData = new TextLoader(env,
columns: new[] { new TextLoader.Column("Term", DataKind.String, 0) },
dataSample: fileSource)
.Read(fileSource);

// Create text loader.
var options = new TextLoader.Options()
{
Columns = new[]
{
new TextLoader.Column("Term", DataKind.String, 0)
}
};
var reader = new TextLoader(env, options: options, dataSample: fileSource);

keyData = reader.Read(fileSource);

src = "Term";
// In this case they are relying on heuristics, so auto-loading in this case is most appropriate.
autoConvert = true;
Expand Down
14 changes: 10 additions & 4 deletions src/Microsoft.ML.Transforms/Text/StopWordsRemovingTransformer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -737,13 +737,19 @@ private IDataLoader GetLoaderForStopwords(IChannel ch, string dataFile,
{
if (stopwordsCol == null)
stopwordsCol = "Stopwords";
dataLoader = new TextLoader(
Host,
columns: new[]

// Create text loader.
var options = new TextLoader.Options()
{
Columns = new[]
{
new TextLoader.Column(stopwordsCol, DataKind.String, 0)
},
dataSample: fileSource).Read(fileSource) as IDataLoader;
Separators = new[] { ',' },
};
var reader = new TextLoader(Host, options: options, dataSample: fileSource);

dataLoader = reader.Read(fileSource) as IDataLoader;
}
ch.AssertNonEmpty(stopwordsCol);
}
Expand Down
Loading