Skip to content

Enable TextLoader to accept new lines in quoted fields #5125

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 32 commits into from
May 19, 2020
Merged
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
e03b0e4
fix textloader bug on quotes
LittleLittleCloud Dec 17, 2019
3ddb3b0
use static method instead of extension method
LittleLittleCloud Dec 19, 2019
9b20ead
better name
LittleLittleCloud Dec 19, 2019
34d9efa
Merge branch 'master' into u/xiaoyun/fixTextLoaderBug
LittleLittleCloud Apr 15, 2020
ac6f971
Merge remote-tracking branch 'xiaoyun/u/xiaoyun/fixTextLoaderBug' int…
antoniovs1029 May 11, 2020
8304d62
Moved multiline reader to TextLoaderCursor, and added return when mul…
antoniovs1029 May 13, 2020
a9e91e2
Fix issue with empty unquoted new lines
antoniovs1029 May 13, 2020
9cfaee1
Make ReadMultiLine a little bit more efficient
antoniovs1029 May 13, 2020
2827c61
Add clarifying comment in FetchNextField
antoniovs1029 May 13, 2020
a427662
Added test
antoniovs1029 May 13, 2020
1cafbf0
Added test for column inference
antoniovs1029 May 14, 2020
6116d97
Make multilinereader a little bit more efficient
antoniovs1029 May 14, 2020
df2ca25
Create read multilines parameter
antoniovs1029 May 14, 2020
530f41e
Make tests run with readMultilines parameter
antoniovs1029 May 14, 2020
d9af9d2
Remove public parameter for readMultilines, as it is considered a bre…
antoniovs1029 May 15, 2020
6a7b632
Update manifest
antoniovs1029 May 15, 2020
fb6ab28
Update src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
antoniovs1029 May 15, 2020
f592a7f
Update src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
antoniovs1029 May 15, 2020
a13b803
Removed new readMultiline option from AutoML.NET and removed the test…
antoniovs1029 May 15, 2020
c2d2ac7
Merge branch 'is4460TextLoaderQuoting' of https://github.com/antoniov…
antoniovs1029 May 15, 2020
13033bf
Actually add new line characters to loaded strings
antoniovs1029 May 16, 2020
2983b06
Actually include new line characters in loaded strings
antoniovs1029 May 16, 2020
9789479
Let the TextSaver also correctly save new lines inside quoted fields
antoniovs1029 May 18, 2020
60c4169
Fixed bug when calling GetSomeLines
antoniovs1029 May 18, 2020
fa28ddd
Throw exception when reaching EOF without ending quote on a given field.
antoniovs1029 May 19, 2020
56279b4
Refactored readMultilines into OptionsFlags
antoniovs1029 May 19, 2020
51d9390
Added test to check new readMultiline option from MAML.
antoniovs1029 May 19, 2020
5cc7512
Merge remote-tracking branch 'upstream/master' into is4460TextLoaderQ…
antoniovs1029 May 19, 2020
5be5f6f
Fixed mistake on an unrelated test
antoniovs1029 May 19, 2020
b4e3029
Added internal default to ReadMultilines
antoniovs1029 May 19, 2020
7e16fc7
Do more checking in MultilineReader in order to be more flexible in a…
antoniovs1029 May 19, 2020
f0652a5
Updated tests
antoniovs1029 May 19, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -501,6 +501,13 @@ public class Options
[Argument(ArgumentType.AtMostOnce, HelpText = "Use separate parsing threads?", ShortName = "threads", Hide = true)]
public bool UseThreads = true;

/// <summary>
/// If true, new line characters are acceptable inside a quoted field, and thus one field can have multiple lines of text inside it
/// If <see cref="TextLoader.Options.AllowQuoting"/> is false, this option is ignored.
/// </summary>
[Argument(ArgumentType.AtMostOnce, HelpText = "Escape new line characters inside a quoted field? If AllowQuoting is false, this argument is ignored.", ShortName = "multilines", Hide = true)]
public bool ReadMultilines = false;

/// <summary>
/// File containing a header with feature names. If specified, the header defined in the data file is ignored regardless of <see cref="HasHeader"/>.
/// </summary>
Expand Down Expand Up @@ -694,11 +701,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile,
ch.Assert(0 <= inputSize & inputSize < SrcLim);
List<ReadOnlyMemory<char>> lines = null;
if (headerFile != null)
Cursor.GetSomeLines(headerFile, 1, ref lines);
Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, ref lines);
if (needInputSize && inputSize == 0)
Cursor.GetSomeLines(dataSample, 100, ref lines);
Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, ref lines);
else if (headerFile == null && parent.HasHeader)
Cursor.GetSomeLines(dataSample, 1, ref lines);
Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, ref lines);

if (needInputSize && inputSize == 0)
{
Expand Down Expand Up @@ -1055,7 +1062,7 @@ private static VersionInfo GetVersionInfo()
// verWrittenCur: 0x00010009, // Introduced _flags
//verWrittenCur: 0x0001000A, // Added ForceVector in Range
//verWrittenCur: 0x0001000B, // Header now retained if used and present
verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType
verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags
verReadableCur: 0x0001000A,
verWeCanReadBack: 0x00010009,
loaderSignature: LoaderSignature,
Expand All @@ -1073,8 +1080,8 @@ private enum OptionFlags : uint
HasHeader = 0x02,
AllowQuoting = 0x04,
AllowSparse = 0x08,

All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse
ReadMultilines = 0x10,
All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines
}

// This is reserved to mean the range extends to the end (the segment is variable).
Expand All @@ -1095,6 +1102,11 @@ private bool HasHeader
get { return (_flags & OptionFlags.HasHeader) != 0; }
}

private bool ReadMultilines
{
get { return (_flags & OptionFlags.ReadMultilines) != 0; }
}

private readonly IHost _host;
private const string RegistrationName = "TextLoader";

Expand Down Expand Up @@ -1147,6 +1159,8 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
_flags |= OptionFlags.AllowQuoting;
if (options.AllowSparse)
_flags |= OptionFlags.AllowSparse;
if (options.AllowQuoting && options.ReadMultilines)
_flags |= OptionFlags.ReadMultilines;

// REVIEW: This should be persisted (if it should be maintained).
_maxRows = options.MaxRows ?? long.MaxValue;
Expand Down
83 changes: 76 additions & 7 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
Expand Down Expand Up @@ -145,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil
SetupCursor(parent, active, 0, out srcNeeded, out cthd);
Contracts.Assert(cthd > 0);

var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, 1);
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._maxRows, 1);
var stats = new ParseStats(parent._host, 1);
return new Cursor(parent, stats, active, reader, srcNeeded, cthd);
}
Expand All @@ -162,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc
SetupCursor(parent, active, n, out srcNeeded, out cthd);
Contracts.Assert(cthd > 0);

var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, cthd);
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._maxRows, cthd);
var stats = new ParseStats(parent._host, cthd);
if (cthd <= 1)
return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) };
Expand Down Expand Up @@ -204,7 +205,7 @@ public override ValueGetter<DataViewRowId> GetIdGetter()
};
}

public static void GetSomeLines(IMultiStreamSource source, int count, ref List<ReadOnlyMemory<char>> lines)
public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, ref List<ReadOnlyMemory<char>> lines)
{
Contracts.AssertValue(source);
Contracts.Assert(count > 0);
Expand All @@ -214,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, ref List<R
count = 2;

LineBatch batch;
var reader = new LineReader(source, count, 1, false, count, 1);
var reader = new LineReader(source, count, 1, false, readMultilines, count, 1);
try
{
batch = reader.GetBatch();
Expand Down Expand Up @@ -401,6 +402,7 @@ private sealed class LineReader
{
private readonly long _limit;
private readonly bool _hasHeader;
private readonly bool _readMultilines;
private readonly int _batchSize;
private readonly IMultiStreamSource _files;

Expand All @@ -410,7 +412,7 @@ private sealed class LineReader
private Task _thdRead;
private volatile bool _abort;

public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, long limit, int cref)
public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, long limit, int cref)
{
// Note that files is allowed to be empty.
Contracts.AssertValue(files);
Expand All @@ -423,6 +425,7 @@ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool has
_limit = limit;
_hasHeader = hasHeader;
_batchSize = batchSize;
_readMultilines = readMultilines;
_files = files;
_cref = cref;

Expand Down Expand Up @@ -464,9 +467,67 @@ public LineBatch GetBatch()
throw Contracts.ExceptDecode(batch.Exception, "Stream reading encountered exception");
}

private static class MultiLineReader
{
// When reading lines that contain quoted fields, the quoted fields can contain
// '\n' so we we'll need to read multiple lines (multilines) to get all the fields
// of a given row.
public static string ReadMultiLine(TextReader sr, StringBuilder sb, long lineNum, bool ignoreHashLine)
{
string line;
line = sr.ReadLine();

// if it was an empty line or if we've reached the end of file (i.e. line = null)
if (string.IsNullOrEmpty(line))
return line;

// In ML.NET we filter out lines beginning with // and # at the beginning of the file
// Or lines beginning with // elsewhere in the file.
// Thus, we don't care to check if there's a quoted multiline when the line begins with
// these chars.
if (line[0] == '/' && line[1] == '/')
return line;
if (ignoreHashLine && line[0] == '#')
return line;
Copy link
Contributor

@justinormont justinormont May 15, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should these check if you're at the top of the file? I think once you've hit the first row of data, no more ignoring of rows is allowed. #Resolved

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Current behavior is that at the beginning of the file both // and # are ignored by LineReader link, but after the first row of data is hit, LineReader only ignores // link

Because of this, I actually added the ignoreHashLine flag on this method, and use it on the code you've pointed to.

One of the tests I've added on this PR actually shows that "// lines" are ignored correctly throughout the file.


In reply to: 425748062 [](ancestors = 425748062)


// Get more lines until the number of quote characters is even
// 2 consecutive quotes are considered scaped quotes
long numOfQuotes = GetNumberOfChars(line, '"');
if (numOfQuotes % 2 == 0)
return line;

sb.Clear();
sb.Append(line);
while (numOfQuotes % 2 != 0)
Copy link
Contributor

@justinormont justinormont May 19, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How would you handle the case

Label,Size,Age
true,5.4",5

I think we need to check that a quoted field begins with the quote character (vs only includes ones). If the field isn't quoted, it can include non-escaped quote chars. If it's quoted, all quotes must be escaped within the field. #Resolved

Copy link
Member Author

@antoniovs1029 antoniovs1029 May 19, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to the RFC your row isn't valid, as " should always be either escaped, or be used to indicate a quoted field. So I assumed that datasets should comply with RFC when using ReadMultiline, to make the MultilineReader code less complicated (and faster).

Should we actually accept this even when ReadMultiline is true? #Resolved

Copy link
Member Author

@antoniovs1029 antoniovs1029 May 19, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mmm ok, after thinking about this a bit, I will anyways have to change the logic in this method for the escapeChar feature, so I might just as well allow some subset of cases similar to the one you pointed out (i.e. using quotes inside an unquoted field, and not counting them as invalid quotes). I guess it's worth it to avoid users hitting the EOF exception and/or loading much of the text into memory which are things that would occur with invalid CSVs such as this one if I don't take special care of this kind of case.


In reply to: 427003836 [](ancestors = 427003836)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've done this. Let us follow this thread here:
#5125 (comment)


In reply to: 427010189 [](ancestors = 427010189,427003836)

{
line = sr.ReadLine();

if (line == null) // If we've reached the end of the file
throw new EndOfStreamException($"A quote opened on a field on line {lineNum} was never closed, and we've read to the last line in the file without finding the closing quote");

sb.Append("\n");
sb.Append(line);
numOfQuotes += GetNumberOfChars(line, '"');
}

return sb.ToString();
}

public static int GetNumberOfChars(string line, char ch)
{
Copy link
Contributor

@harishsk harishsk May 14, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is line.ToCharArray().Count(c => c == ch) a better approach? #Resolved

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about it, but I read here that actually the most efficient way is to iterate over the chars:
https://stackoverflow.com/a/10391548


In reply to: 425452787 [](ancestors = 425452787)

Copy link
Contributor

@justinormont justinormont May 15, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If perf testing shows excessive time spent in this function, you could argument the existing C++/assembly intrinsics w/ a character counter. I think _mm256_cmpeq_epi8 would work to accelerate the comparison. #Resolved

int count = 0;
foreach (char c in line)
{
if (c == ch) count++;
}
return count;
}
}

private void ThreadProc()
{
Contracts.Assert(_batchSize >= 2);
var multilineSB = new StringBuilder();

try
{
Expand All @@ -487,7 +548,11 @@ private void ThreadProc()
// REVIEW: Avoid allocating a string for every line. This would probably require
// introducing a CharSpan type (similar to ReadOnlyMemory but based on char[] or StringBuilder)
// and implementing all the necessary conversion functionality on it. See task 3871.
text = rdr.ReadLine();
if (_readMultilines)
text = MultiLineReader.ReadMultiLine(rdr, multilineSB, line, true);
else
text = rdr.ReadLine();

if (text == null)
goto LNext;
line++;
Expand All @@ -514,7 +579,11 @@ private void ThreadProc()
if (_abort)
return;

text = rdr.ReadLine();
if (_readMultilines)
text = MultiLineReader.ReadMultiLine(rdr, multilineSB, line, false);
else
text = rdr.ReadLine();

if (text == null)
{
// We're done with this file. Queue the last partial batch.
Expand Down
5 changes: 5 additions & 0 deletions src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1161,6 +1161,11 @@ private bool FetchNextField(ref ScanInfo scan, ReadOnlySpan<char> span)
scan.QuotingError = true;
break;
}

// The logic below allow us to escape quotes (") inside quoted
// fields by using doublo quotes (""). I.e. when the loader
// encounters "" inside a quoted field, it will output only one "
// and continue parsing the rest of the field.
if (span[ichCur] == '"')
Copy link
Contributor

@justinormont justinormont May 15, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Might want to expose this character as a parameter. The other common choice is backslash escaping. #Resolved

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To implement the feature you're suggesting on this comment I do believe that we only need to change the code you've pointed to in here, so it's kind of straightforward.

Right now I am going to focus on making sure this PR works well with AutoML/Modelbuilder, that TextSaver works with new lines and that we handle correctly the case of having a badly formatted input file. If I can tackle this with enough time before the next release, then I will try to implement this suggestion.


In reply to: 425777440 [](ancestors = 425777440)

Copy link
Member Author

@antoniovs1029 antoniovs1029 May 15, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Change in plans, I won't address surfacing the new readMultiline option to AutoML/ModelBuilder on this PR. So now I will actually focus on the other things, including implementing this escapechar option before the next release. See #5125 (comment)


In reply to: 425777440 [](ancestors = 425777440)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note to myself: I've just realized that to implement the escapechar feature, then I'll also need to change the logic in the MultilineReader, because if the escapechar isn't " then checking for even number of quotes won't be enough.


In reply to: 425777440 [](ancestors = 425777440)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As discussed offline, I'll address this in an upcoming PR.


In reply to: 426128704 [](ancestors = 426128704,425777440)

{
if (ichCur > ichRun)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
HasHeader = hasHeader,
AllowQuoting = allowQuoting,
TrimWhitespace = trimWhitespace,
AllowSparse = allowSparse
AllowSparse = allowSparse,
};

return new TextLoader(CatalogUtils.GetEnvironment(catalog), options: options, dataSample: dataSample);
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -827,7 +827,7 @@ internal static void MapText(ReadOnlySpan<char> span, ref StringBuilder sb, char
for (; ichCur < ichLim; ichCur++)
{
char ch = span[ichCur];
if (ch != '"' && ch != sep && ch != ':')
if (ch != '"' && ch != sep && ch != ':' && ch != '\n')
continue;
if (!quoted)
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=id:R4:0
#@ col=description:TX:1
#@ col=animal:TX:2
#@ }
id description animal
10 this is a description dog
11 this is a quoted description cat
12 "this is a multiline
quoted description" bird
13 "this has one""doublequote which should be escaped as a single quote" dog
14 "this has ""doublequotes"" inside of it" cat
15 "this is a multiline
quoted description with

""doublequotes"" and

empty new lines and



escaped quotes inside of ""it""

//and this comment with // shouldn't be ignored
since it is part of the ""multiline""" bird
16 here is text after the empty line dog
17 this is a line with an empty animal ""
0 this is a line with an empty id bird
19 "" dog
20 this is the last row description cat
Wrote 11 rows of length 3
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#@ TextLoader{
#@ header+
#@ sep=tab
#@ col=id:R4:0
#@ col=description:TX:1
#@ col=animal:TX:2
#@ }
id description animal
10 this is a description dog
11 this is a quoted description cat
12 "this is a multiline
quoted description" bird
13 "this has one""doublequote which should be escaped as a single quote" dog
14 "this has ""doublequotes"" inside of it" cat
15 "this is a multiline
quoted description with

""doublequotes"" and

empty new lines and



escaped quotes inside of ""it""

//and this comment with // shouldn't be ignored
since it is part of the ""multiline""" bird
16 here is text after the empty line dog
17 this is a line with an empty animal ""
0 this is a line with an empty id bird
19 "" dog
20 this is the last row description cat
Wrote 11 rows of length 3
12 changes: 12 additions & 0 deletions test/BaselineOutput/Common/EntryPoints/core_manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -405,6 +405,18 @@
"IsNullable": false,
"Default": true
},
{
"Name": "ReadMultilines",
"Type": "Bool",
"Desc": "Escape new line characters inside a quoted field? If AllowQuoting is false, this argument is ignored.",
"Aliases": [
"multilines"
],
"Required": false,
"SortOrder": 150.0,
"IsNullable": false,
"Default": false
},
{
"Name": "HeaderFile",
"Type": "String",
Expand Down
33 changes: 33 additions & 0 deletions test/BaselineOutput/Common/TextLoader/multiline.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
10
11
12
13
14
15
16
17
0
19
20
this is a description
this is a quoted description
this is a multiline\nquoted description
this has one"doublequote which should be escaped as a single quote
this has "doublequotes" inside of it
this is a multiline\nquoted description with\n\n"doublequotes" and\n\nempty new lines and\n\n\n\nescaped quotes inside of "it"\n\n//and this comment with // shouldn't be ignored\nsince it is part of the "multiline"
here is text after the empty line
this is a line with an empty animal
this is a line with an empty id

this is the last row description
dog
cat
bird
dog
cat
bird
dog

bird
dog
cat
Loading