-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Enable TextLoader to accept new lines in quoted fields #5125
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 14 commits
e03b0e4
3ddb3b0
9b20ead
34d9efa
ac6f971
8304d62
a9e91e2
9cfaee1
2827c61
a427662
1cafbf0
6116d97
df2ca25
530f41e
d9af9d2
6a7b632
fb6ab28
f592a7f
a13b803
c2d2ac7
13033bf
2983b06
9789479
60c4169
fa28ddd
56279b4
51d9390
5cc7512
5be5f6f
b4e3029
7e16fc7
f0652a5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
|
||
using System; | ||
using System.Collections.Generic; | ||
using System.IO; | ||
using System.Text; | ||
using System.Threading; | ||
using System.Threading.Tasks; | ||
|
@@ -145,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil | |
SetupCursor(parent, active, 0, out srcNeeded, out cthd); | ||
Contracts.Assert(cthd > 0); | ||
|
||
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, 1); | ||
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._readMultilines, parent._maxRows, 1); | ||
var stats = new ParseStats(parent._host, 1); | ||
return new Cursor(parent, stats, active, reader, srcNeeded, cthd); | ||
} | ||
|
@@ -162,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc | |
SetupCursor(parent, active, n, out srcNeeded, out cthd); | ||
Contracts.Assert(cthd > 0); | ||
|
||
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, cthd); | ||
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._readMultilines, parent._maxRows, cthd); | ||
var stats = new ParseStats(parent._host, cthd); | ||
if (cthd <= 1) | ||
return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) }; | ||
|
@@ -204,7 +205,7 @@ public override ValueGetter<DataViewRowId> GetIdGetter() | |
}; | ||
} | ||
|
||
public static void GetSomeLines(IMultiStreamSource source, int count, ref List<ReadOnlyMemory<char>> lines) | ||
public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, ref List<ReadOnlyMemory<char>> lines) | ||
{ | ||
Contracts.AssertValue(source); | ||
Contracts.Assert(count > 0); | ||
|
@@ -214,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, ref List<R | |
count = 2; | ||
|
||
LineBatch batch; | ||
var reader = new LineReader(source, count, 1, false, count, 1); | ||
var reader = new LineReader(source, count, 1, false, readMultilines, count, 1); | ||
try | ||
{ | ||
batch = reader.GetBatch(); | ||
|
@@ -401,6 +402,7 @@ private sealed class LineReader | |
{ | ||
private readonly long _limit; | ||
private readonly bool _hasHeader; | ||
private readonly bool _readMultilines; | ||
private readonly int _batchSize; | ||
private readonly IMultiStreamSource _files; | ||
|
||
|
@@ -410,7 +412,7 @@ private sealed class LineReader | |
private Task _thdRead; | ||
private volatile bool _abort; | ||
|
||
public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, long limit, int cref) | ||
public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, long limit, int cref) | ||
{ | ||
// Note that files is allowed to be empty. | ||
Contracts.AssertValue(files); | ||
|
@@ -423,6 +425,7 @@ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool has | |
_limit = limit; | ||
_hasHeader = hasHeader; | ||
_batchSize = batchSize; | ||
_readMultilines = readMultilines; | ||
_files = files; | ||
_cref = cref; | ||
|
||
|
@@ -464,9 +467,69 @@ public LineBatch GetBatch() | |
throw Contracts.ExceptDecode(batch.Exception, "Stream reading encountered exception"); | ||
} | ||
|
||
private static class MultiLineReader | ||
{ | ||
// When reading lines that contain quoted fields, the quoted fields can contain | ||
// '\n' so we we'll need to read multiple lines (multilines) to get all the fields | ||
// of a given row. | ||
public static string ReadMultiLine(TextReader sr, StringBuilder sb, bool ignoreHashLine) | ||
{ | ||
string line; | ||
line = sr.ReadLine(); | ||
|
||
// if it was an empty line or if we've reached the end of file (i.e. line = null) | ||
if (string.IsNullOrEmpty(line)) | ||
return line; | ||
|
||
// In ML.NET we filter out lines beginning with // and # at the beginning of the file | ||
// Or lines beginning with // elsewhere in the file. | ||
// Thus, we don't care to check if there's a quoted multiline when the line begins with | ||
// these chars. | ||
if (line[0] == '/' && line[1] == '/') | ||
return line; | ||
if (ignoreHashLine && line[0] == '#') | ||
return line; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should these check if you're at the top of the file? I think once you've hit the first row of data, no more ignoring of rows is allowed. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Current behavior is that at the beginning of the file both // and # are ignored by LineReader link, but after the first row of data is hit, LineReader only ignores // link Because of this, I actually added the ignoreHashLine flag on this method, and use it on the code you've pointed to. One of the tests I've added on this PR actually shows that "// lines" are ignored correctly throughout the file. In reply to: 425748062 [](ancestors = 425748062) |
||
|
||
// Get more lines until the number of quote characters is even | ||
// 2 consecutive quotes are considered scaped quotes | ||
long numOfQuotes = GetNumberOfChars(line, '"'); | ||
if (numOfQuotes % 2 == 0) | ||
antoniovs1029 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return line; | ||
|
||
sb.Clear(); | ||
sb.Append(line); | ||
while (numOfQuotes % 2 != 0) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How would you handle the case
I think we need to check that a quoted field begins with the quote character (vs only includes ones). If the field isn't quoted, it can include non-escaped quote chars. If it's quoted, all quotes must be escaped within the field. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. According to the RFC your row isn't valid, as " should always be either escaped, or be used to indicate a quoted field. So I assumed that datasets should comply with RFC when using Should we actually accept this even when ReadMultiline is true? #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mmm ok, after thinking about this a bit, I will anyways have to change the logic in this method for the escapeChar feature, so I might just as well allow some subset of cases similar to the one you pointed out (i.e. using quotes inside an unquoted field, and not counting them as invalid quotes). I guess it's worth it to avoid users hitting the EOF exception and/or loading much of the text into memory which are things that would occur with invalid CSVs such as this one if I don't take special care of this kind of case. In reply to: 427003836 [](ancestors = 427003836) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've done this. Let us follow this thread here: In reply to: 427010189 [](ancestors = 427010189,427003836) |
||
{ | ||
line = sr.ReadLine(); | ||
|
||
if (line == null) // If we've reached the end of the file | ||
break; // MYTODO: This could happen if we have an invalid open quote which never closes so we reach the end of the file without properly closing the field, should we throw instead in this case? | ||
antoniovs1029 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
||
if(line.Length != 0) | ||
justinormont marked this conversation as resolved.
Show resolved
Hide resolved
|
||
sb.Append(" "); // MYTODO: should we use instead a "\n" in here to separate lines? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @justinormont recommended adding an actual new line here instead of a space. But I'm not sure if it would matter much. Also I wouldn't know when to append "\n" instead of "\r\n", or if it would make any difference. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep. We'll want newlines instead of spaces. The same as newlines are different to human readers, the ML can just as well make use of this information. We also have no control over how the user featurizes their dataset in the ML pipeline; the newlines could be important to their processing. We also would want to maintain round trip read/write of the dataset. Which means we'd have to modify the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've done this in a more recent iteration. To continue discussing about this, it'd be better to do it here: In reply to: 425757137 [](ancestors = 425757137) |
||
|
||
sb.Append(line); | ||
numOfQuotes += GetNumberOfChars(line, '"'); | ||
} | ||
|
||
return sb.ToString(); | ||
} | ||
|
||
public static int GetNumberOfChars(string line, char ch) | ||
{ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I thought about it, but I read here that actually the most efficient way is to iterate over the chars: In reply to: 425452787 [](ancestors = 425452787) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If perf testing shows excessive time spent in this function, you could argument the existing C++/assembly intrinsics w/ a character counter. I think |
||
int count = 0; | ||
foreach (char c in line) | ||
{ | ||
if (c == ch) count++; | ||
} | ||
return count; | ||
} | ||
} | ||
|
||
private void ThreadProc() | ||
{ | ||
Contracts.Assert(_batchSize >= 2); | ||
var multilineSB = new StringBuilder(); | ||
|
||
try | ||
{ | ||
|
@@ -487,7 +550,11 @@ private void ThreadProc() | |
// REVIEW: Avoid allocating a string for every line. This would probably require | ||
// introducing a CharSpan type (similar to ReadOnlyMemory but based on char[] or StringBuilder) | ||
// and implementing all the necessary conversion functionality on it. See task 3871. | ||
text = rdr.ReadLine(); | ||
if (_readMultilines) | ||
text = MultiLineReader.ReadMultiLine(rdr, multilineSB, true); | ||
else | ||
text = rdr.ReadLine(); | ||
|
||
if (text == null) | ||
goto LNext; | ||
line++; | ||
|
@@ -514,7 +581,11 @@ private void ThreadProc() | |
if (_abort) | ||
return; | ||
|
||
text = rdr.ReadLine(); | ||
if (_readMultilines) | ||
text = MultiLineReader.ReadMultiLine(rdr, multilineSB, true); | ||
else | ||
text = rdr.ReadLine(); | ||
|
||
if (text == null) | ||
{ | ||
// We're done with this file. Queue the last partial batch. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1161,6 +1161,11 @@ private bool FetchNextField(ref ScanInfo scan, ReadOnlySpan<char> span) | |
scan.QuotingError = true; | ||
break; | ||
} | ||
|
||
// The logic below allow us to scape quotes (") inside quoted | ||
antoniovs1029 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// fields by using doublo quotes (""). I.e. when the loader | ||
// encounters "" inside a quoted field, it will output only one " | ||
// and continue parsing the rest of the field. | ||
if (span[ichCur] == '"') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Might want to expose this character as a parameter. The other common choice is backslash escaping. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To implement the feature you're suggesting on this comment I do believe that we only need to change the code you've pointed to in here, so it's kind of straightforward. Right now I am going to focus on making sure this PR works well with AutoML/Modelbuilder, that TextSaver works with new lines and that we handle correctly the case of having a badly formatted input file. If I can tackle this with enough time before the next release, then I will try to implement this suggestion. In reply to: 425777440 [](ancestors = 425777440) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Change in plans, I won't address surfacing the new readMultiline option to AutoML/ModelBuilder on this PR. So now I will actually focus on the other things, including implementing this escapechar option before the next release. See #5125 (comment) In reply to: 425777440 [](ancestors = 425777440) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note to myself: I've just realized that to implement the escapechar feature, then I'll also need to change the logic in the MultilineReader, because if the escapechar isn't In reply to: 425777440 [](ancestors = 425777440) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As discussed offline, I'll address this in an upcoming PR. In reply to: 426128704 [](ancestors = 426128704,425777440) |
||
{ | ||
if (ichCur > ichRun) | ||
|
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
So I am not familiar at all with AutoML code, but in order to add a test reproducing the scenario in #4460 I had to add this in here for it to work. Problem is that these methods in AutoML and several more methods there would require to either hardcode this or add more parameters to several methods to have a readMultilines parameter.
This is a problem because ReadMultilines is set as false by default, and we would need to manually set it as true anywhere where we'd want AutoML to have this behavior. So I am not sure what would be the best way to handle this from AutoML side. #Resolved
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I expect it's better to sweep over the
ReadMultilines
flag. The same as we do with the other flags forTextLoader
(src).Are there any datasets (or rows) which would be unreadable with
ReadMultilines = true
? #ResolvedUh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As I mention here: #5125 (comment) , if the CSV file contains a quoted field which has a " at the beginning of the field, but it never closes it (which would technically be invalid format), then the file won't be read correctly and some problems may arise from it.
For example, our "wikipedia-detox-250-line-data.tsv" has this error in line 83. The sentiment text field there opens with " but it never closes that quotation, so in that dataset the Multilinereader doesn't load that file correctly.
In reply to: 425713757 [](ancestors = 425713757)
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Regarding your suggestion of simply sweeping over the flags in TrySplitColumns, it doesn't look that would affect the InferColumns method I'm mentioning in this thread (it looks to me the InferColumns method isn't called when calling TrySplitColumns). As you can see in the ColumnInferenceApi, there are several methods which have multiple parameters related to TextLoader, and it seems I'd need to add a new parameter to all of them. But my question is that I wouldn't know if it's necessary to add it in all of them to make the readMultiline parameter accessible to ModelBuilder. Furthermore, it seems these methods are called by the AutoCatalog.cs which also exposes some public methods with parameters related to TextLoader. And it would again seem that I'd need to add a new readMultiline parameter there... but I guess won't be able to do that since that is a public API, and adding parameters is considered a breaking API change.
So, in general, I don't know which criteria to use in order to know where to add a readMultiline parameter in order to make it accessible for ModelBuilder. Is there any other place besides ColumnInferenceAPI and TrySplitColumns?
In reply to: 425713757 [](ancestors = 425713757)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As discussed in here: I won't change anything in AutoML.NET code, and will leave this up to ModelBuilder team.
#5125 (comment)
In reply to: 425726377 [](ancestors = 425726377,425713757)