Skip to content

Make separator char[] everywhere (previously the type was sometimes char) #2702

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ public static void Example()
new TextLoader.Column("Words", DataKind.TX, 0),
new TextLoader.Column("Ids", DataKind.I4, 1),
},
separatorChar: ','
separatorChar: new[] { ',' }
);

// Load the TensorFlow model once.
Expand Down
24 changes: 13 additions & 11 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -428,13 +428,13 @@ public class Options

[Argument(ArgumentType.AtMostOnce, Visibility = ArgumentAttribute.VisibilityType.CmdLineOnly, HelpText = "Source column separator. Options: tab, space, comma, single character", ShortName = "sep")]
// this is internal as it only serves the command line interface
internal string Separator = Defaults.Separator.ToString();
internal string Separator = "\t";

/// <summary>
/// The characters that should be used as column separators.
/// </summary>
[Argument(ArgumentType.AtMostOnce, Name = nameof(Separator), Visibility = ArgumentAttribute.VisibilityType.EntryPointsOnly, HelpText = "Source column separator.", ShortName = "sep")]
public char[] Separators = new[] { Defaults.Separator };
public char[] Separators = Defaults.Separator;

/// <summary>
/// Specifies the input columns that should be mapped to <see cref="IDataView"/> columns.
Expand Down Expand Up @@ -488,7 +488,7 @@ internal static class Defaults
{
internal const bool AllowQuoting = false;
internal const bool AllowSparse = false;
internal const char Separator = '\t';
internal static char[] Separator => new[] { '\t' };
internal const bool HasHeader = false;
internal const bool TrimWhitespace = false;
}
Expand Down Expand Up @@ -1069,10 +1069,10 @@ private bool HasHeader
/// <param name="allowSparse">Whether the file can contain numerical vectors in sparse format.</param>
/// <param name="allowQuoting">Whether the content of a column can be parsed from a string starting and ending with quote.</param>
/// <param name="dataSample">Allows to expose items that can be used for reading.</param>
internal TextLoader(IHostEnvironment env, Column[] columns, char separatorChar = Defaults.Separator,
internal TextLoader(IHostEnvironment env, Column[] columns, char[] separatorChar = null,
bool hasHeader = Defaults.HasHeader, bool allowSparse = Defaults.AllowSparse,
bool allowQuoting = Defaults.AllowQuoting, IMultiStreamSource dataSample = null)
: this(env, MakeArgs(columns, hasHeader, new[] { separatorChar }, allowSparse, allowQuoting), dataSample)
: this(env, MakeArgs(columns, hasHeader, separatorChar ?? Defaults.Separator, allowSparse, allowQuoting), dataSample)
{
}

Expand Down Expand Up @@ -1145,10 +1145,11 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo

_host.CheckNonEmpty(options.Separator, nameof(options.Separator), "Must specify a separator");

//Default arg.Separator is tab and default options. Separators is also a '\t'.
//At a time only one default can be different and whichever is different that will
//be used.
if (options.Separators.Length > 1 || options.Separators[0] != '\t')
// Default options.Separator is "\t" while default options.Separators is {'\t'}.
// We use options.Separators only if options.Separator is default and choose options.Separator otherwise.
// The logic behind this is that options.Separators has higher priority because it's a public API argument, but
// options.Separator is only for the command line tool and entry points.
if (options.Separator == "\t")
{
var separators = new HashSet<char>();
foreach (char c in options.Separators)
Expand Down Expand Up @@ -1435,11 +1436,12 @@ void ICanSaveModel.Save(ModelSaveContext ctx)

internal static TextLoader CreateTextReader<TInput>(IHostEnvironment host,
bool hasHeader = Defaults.HasHeader,
char separator = Defaults.Separator,
char[] separator = null,
bool allowQuotedStrings = Defaults.AllowQuoting,
bool supportSparse = Defaults.AllowSparse,
bool trimWhitespace = Defaults.TrimWhitespace)
{
separator = separator ?? Defaults.Separator;
var userType = typeof(TInput);

var fieldInfos = userType.GetFields(BindingFlags.Public | BindingFlags.Instance);
Expand Down Expand Up @@ -1492,7 +1494,7 @@ internal static TextLoader CreateTextReader<TInput>(IHostEnvironment host,
Options options = new Options
{
HasHeader = hasHeader,
Separators = new[] { separator },
Separators = separator,
AllowQuoting = allowQuotedStrings,
AllowSparse = supportSparse,
TrimWhitespace = trimWhitespace,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@ public static class TextLoaderSaverCatalog
/// <param name="dataSample">The optional location of a data sample. The sample can be used to infer column names and number of slots in each column.</param>
public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
TextLoader.Column[] columns,
char separatorChar = TextLoader.Defaults.Separator,
char[] separatorChar = null,
bool hasHeader = TextLoader.Defaults.HasHeader,
bool allowSparse = TextLoader.Defaults.AllowSparse,
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
IMultiStreamSource dataSample = null)
=> new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, separatorChar, hasHeader, allowSparse, allowQuoting, dataSample);
=> new TextLoader(CatalogUtils.GetEnvironment(catalog), columns, separatorChar ?? TextLoader.Defaults.Separator, hasHeader, allowSparse, allowQuoting, dataSample);

/// <summary>
/// Create a text loader <see cref="TextLoader"/>.
Expand Down Expand Up @@ -57,12 +57,12 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
/// except for 3rd and 5th columns which have values 6 and 3</param>
/// <param name="trimWhitespace">Remove trailing whitespace from lines</param>
public static TextLoader CreateTextLoader<TInput>(this DataOperationsCatalog catalog,
char separatorChar = TextLoader.Defaults.Separator,
char[] separatorChar = null,
bool hasHeader = TextLoader.Defaults.HasHeader,
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
bool allowSparse = TextLoader.Defaults.AllowSparse,
bool trimWhitespace = TextLoader.Defaults.TrimWhitespace)
=> TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace);
=> TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar ?? TextLoader.Defaults.Separator, allowQuoting, allowSparse, trimWhitespace);

/// <summary>
/// Read a data view from a text file using <see cref="TextLoader"/>.
Expand All @@ -76,7 +76,7 @@ public static TextLoader CreateTextLoader<TInput>(this DataOperationsCatalog cat
public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog,
string path,
TextLoader.Column[] columns,
char separatorChar = TextLoader.Defaults.Separator,
char[] separatorChar = null,
bool hasHeader = TextLoader.Defaults.HasHeader)
{
Contracts.CheckNonEmpty(path, nameof(path));
Expand All @@ -85,7 +85,7 @@ public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog,

// REVIEW: it is almost always a mistake to have a 'trainable' text loader here.
// Therefore, we are going to disallow data sample.
var reader = new TextLoader(env, columns, separatorChar, hasHeader, dataSample: null);
var reader = new TextLoader(env, columns, separatorChar ?? TextLoader.Defaults.Separator, hasHeader, dataSample: null);
return reader.Read(new MultiFileSource(path));
}

Expand All @@ -108,7 +108,7 @@ public static IDataView ReadFromTextFile(this DataOperationsCatalog catalog,
/// <returns>The data view.</returns>
public static IDataView ReadFromTextFile<TInput>(this DataOperationsCatalog catalog,
string path,
char separatorChar = TextLoader.Defaults.Separator,
char[] separatorChar = null,
bool hasHeader = TextLoader.Defaults.HasHeader,
bool allowQuoting = TextLoader.Defaults.AllowQuoting,
bool allowSparse = TextLoader.Defaults.AllowSparse,
Expand All @@ -118,8 +118,8 @@ public static IDataView ReadFromTextFile<TInput>(this DataOperationsCatalog cata

// REVIEW: it is almost always a mistake to have a 'trainable' text loader here.
// Therefore, we are going to disallow data sample.
return TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader, separatorChar, allowQuoting, allowSparse, trimWhitespace)
.Read(new MultiFileSource(path));
return TextLoader.CreateTextReader<TInput>(CatalogUtils.GetEnvironment(catalog), hasHeader,
separatorChar ?? TextLoader.Defaults.Separator, allowQuoting, allowSparse, trimWhitespace).Read(new MultiFileSource(path));
}

/// <summary>
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.SamplesUtils/SamplesDatasetUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ public static IDataView LoadFeaturizedAdultDataset(MLContext mlContext)
new TextLoader.Column("native-country", DataKind.R4, 13),
new TextLoader.Column("IsOver50K", DataKind.BL, 14),
},
separatorChar: ',',
separatorChar: new[] { ',' },
hasHeader: true
);

Expand Down
2 changes: 1 addition & 1 deletion test/Microsoft.ML.Functional.Tests/DataIO.cs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ public void WriteToAndReadASchemaFromADelimitedFile()
{
// Serialize a dataset with a known schema to a file.
var filePath = SerializeDatasetToFile(mlContext, dataBefore, separator);
var dataAfter = mlContext.Data.ReadFromTextFile<TypeTestData>(filePath, separatorChar: separator, hasHeader: true, allowQuoting: true);
var dataAfter = mlContext.Data.ReadFromTextFile<TypeTestData>(filePath, separatorChar: new[] { separator }, hasHeader: true, allowQuoting: true);
Common.AssertTestTypeDatasetsAreEqual(mlContext, dataBefore, dataAfter);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ public static TextLoader GetTextLoader(MLContext mlContext, char separator)
new TextLoader.Column("Ug", DataKind.UG, 15),
new TextLoader.Column("Features", DataKind.R4, 16, 16 + _numFeatures-1),
},
separatorChar: separator,
separatorChar: new[] { separator },
hasHeader: true,
allowQuoting: true);
}
Expand Down
19 changes: 8 additions & 11 deletions test/Microsoft.ML.Tests/OnnxConversionTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -51,8 +51,7 @@ public void SimpleEndToEndOnnxConversionTest()
var trainDataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
var mlContext = new MLContext(seed: 1, conc: 1);
var data = mlContext.Data.ReadFromTextFile<AdultData>(trainDataPath,
separatorChar: ';'
,
separatorChar: new[] { ';' },
hasHeader: true);
var cachedTrainData = mlContext.Data.Cache(data);
var dynamicPipeline =
Expand Down Expand Up @@ -129,7 +128,7 @@ public void KmeansOnnxConversionTest()
string dataPath = GetDataPath("breast-cancer.txt");
// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
var data = mlContext.Data.ReadFromTextFile<BreastCancerFeatureVector>(dataPath,
separatorChar: '\t',
separatorChar: new[] { '\t' },
hasHeader: true);

var pipeline = mlContext.Transforms.Normalize("Features").
Expand Down Expand Up @@ -207,7 +206,7 @@ public void KeyToVectorWithBagOnnxConversionTest()
string dataPath = GetDataPath("breast-cancer.txt");

var data = mlContext.Data.ReadFromTextFile<BreastCancerCatFeatureExample>(dataPath,
separatorChar: '\t',
separatorChar: new[] { '\t' },
hasHeader: true);

var pipeline = mlContext.Transforms.Categorical.OneHotEncoding("F2", "F2", Transforms.Categorical.OneHotEncodingTransformer.OutputKind.Bag)
Expand Down Expand Up @@ -305,8 +304,7 @@ public void LogisticRegressionOnnxConversionTest()
var trainDataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
var mlContext = new MLContext(seed: 1, conc: 1);
var data = mlContext.Data.ReadFromTextFile<AdultData>(trainDataPath,
separatorChar: ';'
,
separatorChar: new[] { ';' },
hasHeader: true);
var cachedTrainData = mlContext.Data.Cache(data);
var dynamicPipeline =
Expand Down Expand Up @@ -338,8 +336,7 @@ public void LightGbmBinaryClassificationOnnxConversionTest()
var trainDataPath = GetDataPath(TestDatasets.generatedRegressionDataset.trainFilename);
var mlContext = new MLContext(seed: 1, conc: 1);
var data = mlContext.Data.ReadFromTextFile<AdultData>(trainDataPath,
separatorChar: ';'
,
separatorChar: new[] { ';' },
hasHeader: true);
var cachedTrainData = mlContext.Data.Cache(data);
var dynamicPipeline =
Expand Down Expand Up @@ -371,7 +368,7 @@ public void MulticlassLogisticRegressionOnnxConversionTest()

string dataPath = GetDataPath("breast-cancer.txt");
var data = mlContext.Data.ReadFromTextFile<BreastCancerMulticlassExample>(dataPath,
separatorChar: '\t',
separatorChar: new[] { '\t' },
hasHeader: true);

var pipeline = mlContext.Transforms.Normalize("Features").
Expand Down Expand Up @@ -401,7 +398,7 @@ public void RemoveVariablesInPipelineTest()

string dataPath = GetDataPath("breast-cancer.txt");
var data = mlContext.Data.ReadFromTextFile<BreastCancerCatFeatureExample>(dataPath,
separatorChar: '\t',
separatorChar: new[] { '\t' },
hasHeader: true);

var pipeline = mlContext.Transforms.Categorical.OneHotEncoding("F2", "F2", Transforms.Categorical.OneHotEncodingTransformer.OutputKind.Bag)
Expand Down Expand Up @@ -452,7 +449,7 @@ public void WordEmbeddingsTest()
var mlContext = new MLContext(seed: 1, conc: 1);
var dataPath = GetDataPath(@"small-sentiment-test.tsv");
var embedNetworkPath = GetDataPath(@"shortsentiment.emd");
var data = mlContext.Data.ReadFromTextFile<SmallSentimentExample>(dataPath, separatorChar: '\t', hasHeader: false);
var data = mlContext.Data.ReadFromTextFile<SmallSentimentExample>(dataPath, separatorChar: new[] { '\t' }, hasHeader: false);

var pipeline = mlContext.Transforms.Text.ExtractWordEmbeddings("Embed", embedNetworkPath, "Tokens");
var model = pipeline.Fit(data);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,7 @@ private void TrainRegression(string trainDataPath, string testDataPath, string m
// Read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
var trainData = mlContext.Data.ReadFromTextFile<AdultData>(trainDataPath,
// Default separator is tab, but we need a semicolon.
separatorChar: ';'
,
separatorChar: new[] { ';' },
// First line of the file is a header, not a data row.
hasHeader: true);

Expand Down Expand Up @@ -115,8 +114,7 @@ private void TrainRegression(string trainDataPath, string testDataPath, string m
// Read the test dataset.
var testData = mlContext.Data.ReadFromTextFile<AdultData>(testDataPath,
// Default separator is tab, but we need a semicolon.
separatorChar: ';'
,
separatorChar: new[] { ';' },
// First line of the file is a header, not a data row.
hasHeader: true);

Expand Down Expand Up @@ -152,7 +150,7 @@ private ITransformer TrainOnIris(string irisDataPath)
// Retrieve the training data.
var trainData = mlContext.Data.ReadFromTextFile<IrisInput>(irisDataPath,
// Default separator is tab, but the dataset has comma.
separatorChar: ','
separatorChar: new[] { ',' }
);

//Preview the data
Expand Down Expand Up @@ -237,7 +235,7 @@ private void NormalizationWorkout(string dataPath)
// Read the training data.
var trainData = mlContext.Data.ReadFromTextFile<IrisInputAllFeatures>(dataPath,
// Default separator is tab, but the dataset has comma.
separatorChar: ','
separatorChar: new[] { ',' }
);

// Apply all kinds of standard ML.NET normalization to the raw features.
Expand Down Expand Up @@ -409,7 +407,7 @@ private void CrossValidationOn(string dataPath)
// Step one: read the data as an IDataView.
var data = mlContext.Data.ReadFromTextFile<IrisInput>(dataPath,
// Default separator is tab, but the dataset has comma.
separatorChar: ','
separatorChar: new[] { ',' }
);

// Build the training pipeline.
Expand Down Expand Up @@ -458,7 +456,7 @@ private void ReadDataDynamic(string dataPath)
// Now read the file (remember though, readers are lazy, so the actual reading will happen when the data is accessed).
var reader = mlContext.Data.ReadFromTextFile<AdultData>(dataPath,
// Default separator is tab, but we need a comma.
separatorChar: ',');
separatorChar: new[] { ',' });
}

// Define a class for all the input columns that we intend to consume.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ void DecomposableTrainAndPredict()
var dataPath = GetDataPath(TestDatasets.irisData.trainFilename);
var ml = new MLContext();

var data = ml.Data.ReadFromTextFile<IrisData>(dataPath, separatorChar: ',');
var data = ml.Data.ReadFromTextFile<IrisData>(dataPath, separatorChar: new[] { ',' });

var pipeline = new ColumnConcatenatingEstimator (ml, "Features", "SepalLength", "SepalWidth", "PetalLength", "PetalWidth")
.Append(new ValueToKeyMappingEstimator(ml, "Label"), TransformerScope.TrainTest)
Expand All @@ -39,7 +39,7 @@ void DecomposableTrainAndPredict()
var model = pipeline.Fit(data).GetModelFor(TransformerScope.Scoring);
var engine = model.CreatePredictionEngine<IrisDataNoLabel, IrisPrediction>(ml);

var testLoader = ml.Data.ReadFromTextFile(dataPath, TestDatasets.irisData.GetLoaderColumns(), separatorChar: ',', hasHeader: true);
var testLoader = ml.Data.ReadFromTextFile(dataPath, TestDatasets.irisData.GetLoaderColumns(), separatorChar: new[] { ',' }, hasHeader: true);
var testData = ml.CreateEnumerable<IrisData>(testLoader, false);
foreach (var input in testData.Take(20))
{
Expand Down
Loading