-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Enable TextLoader to accept new lines in quoted fields #5125
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 29 commits
e03b0e4
3ddb3b0
9b20ead
34d9efa
ac6f971
8304d62
a9e91e2
9cfaee1
2827c61
a427662
1cafbf0
6116d97
df2ca25
530f41e
d9af9d2
6a7b632
fb6ab28
f592a7f
a13b803
c2d2ac7
13033bf
2983b06
9789479
60c4169
fa28ddd
56279b4
51d9390
5cc7512
5be5f6f
b4e3029
7e16fc7
f0652a5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
|
||
using System; | ||
using System.Collections.Generic; | ||
using System.IO; | ||
using System.Text; | ||
using System.Threading; | ||
using System.Threading.Tasks; | ||
|
@@ -145,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil | |
SetupCursor(parent, active, 0, out srcNeeded, out cthd); | ||
Contracts.Assert(cthd > 0); | ||
|
||
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, 1); | ||
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._maxRows, 1); | ||
var stats = new ParseStats(parent._host, 1); | ||
return new Cursor(parent, stats, active, reader, srcNeeded, cthd); | ||
} | ||
|
@@ -162,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc | |
SetupCursor(parent, active, n, out srcNeeded, out cthd); | ||
Contracts.Assert(cthd > 0); | ||
|
||
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, cthd); | ||
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._maxRows, cthd); | ||
var stats = new ParseStats(parent._host, cthd); | ||
if (cthd <= 1) | ||
return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) }; | ||
|
@@ -204,7 +205,7 @@ public override ValueGetter<DataViewRowId> GetIdGetter() | |
}; | ||
} | ||
|
||
public static void GetSomeLines(IMultiStreamSource source, int count, ref List<ReadOnlyMemory<char>> lines) | ||
public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, ref List<ReadOnlyMemory<char>> lines) | ||
{ | ||
Contracts.AssertValue(source); | ||
Contracts.Assert(count > 0); | ||
|
@@ -214,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, ref List<R | |
count = 2; | ||
|
||
LineBatch batch; | ||
var reader = new LineReader(source, count, 1, false, count, 1); | ||
var reader = new LineReader(source, count, 1, false, readMultilines, count, 1); | ||
try | ||
{ | ||
batch = reader.GetBatch(); | ||
|
@@ -401,6 +402,7 @@ private sealed class LineReader | |
{ | ||
private readonly long _limit; | ||
private readonly bool _hasHeader; | ||
private readonly bool _readMultilines; | ||
private readonly int _batchSize; | ||
private readonly IMultiStreamSource _files; | ||
|
||
|
@@ -410,7 +412,7 @@ private sealed class LineReader | |
private Task _thdRead; | ||
private volatile bool _abort; | ||
|
||
public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, long limit, int cref) | ||
public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, long limit, int cref) | ||
{ | ||
// Note that files is allowed to be empty. | ||
Contracts.AssertValue(files); | ||
|
@@ -423,6 +425,7 @@ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool has | |
_limit = limit; | ||
_hasHeader = hasHeader; | ||
_batchSize = batchSize; | ||
_readMultilines = readMultilines; | ||
_files = files; | ||
_cref = cref; | ||
|
||
|
@@ -464,9 +467,67 @@ public LineBatch GetBatch() | |
throw Contracts.ExceptDecode(batch.Exception, "Stream reading encountered exception"); | ||
} | ||
|
||
private static class MultiLineReader | ||
{ | ||
// When reading lines that contain quoted fields, the quoted fields can contain | ||
// '\n' so we we'll need to read multiple lines (multilines) to get all the fields | ||
// of a given row. | ||
public static string ReadMultiLine(TextReader sr, StringBuilder sb, long lineNum, bool ignoreHashLine) | ||
{ | ||
string line; | ||
line = sr.ReadLine(); | ||
|
||
// if it was an empty line or if we've reached the end of file (i.e. line = null) | ||
if (string.IsNullOrEmpty(line)) | ||
return line; | ||
|
||
// In ML.NET we filter out lines beginning with // and # at the beginning of the file | ||
// Or lines beginning with // elsewhere in the file. | ||
// Thus, we don't care to check if there's a quoted multiline when the line begins with | ||
// these chars. | ||
if (line[0] == '/' && line[1] == '/') | ||
return line; | ||
if (ignoreHashLine && line[0] == '#') | ||
return line; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should these check if you're at the top of the file? I think once you've hit the first row of data, no more ignoring of rows is allowed. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Current behavior is that at the beginning of the file both // and # are ignored by LineReader link, but after the first row of data is hit, LineReader only ignores // link Because of this, I actually added the ignoreHashLine flag on this method, and use it on the code you've pointed to. One of the tests I've added on this PR actually shows that "// lines" are ignored correctly throughout the file. In reply to: 425748062 [](ancestors = 425748062) |
||
|
||
// Get more lines until the number of quote characters is even | ||
// 2 consecutive quotes are considered scaped quotes | ||
long numOfQuotes = GetNumberOfChars(line, '"'); | ||
if (numOfQuotes % 2 == 0) | ||
antoniovs1029 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return line; | ||
|
||
sb.Clear(); | ||
sb.Append(line); | ||
while (numOfQuotes % 2 != 0) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How would you handle the case
I think we need to check that a quoted field begins with the quote character (vs only includes ones). If the field isn't quoted, it can include non-escaped quote chars. If it's quoted, all quotes must be escaped within the field. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. According to the RFC your row isn't valid, as " should always be either escaped, or be used to indicate a quoted field. So I assumed that datasets should comply with RFC when using Should we actually accept this even when ReadMultiline is true? #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Mmm ok, after thinking about this a bit, I will anyways have to change the logic in this method for the escapeChar feature, so I might just as well allow some subset of cases similar to the one you pointed out (i.e. using quotes inside an unquoted field, and not counting them as invalid quotes). I guess it's worth it to avoid users hitting the EOF exception and/or loading much of the text into memory which are things that would occur with invalid CSVs such as this one if I don't take special care of this kind of case. In reply to: 427003836 [](ancestors = 427003836) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've done this. Let us follow this thread here: In reply to: 427010189 [](ancestors = 427010189,427003836) |
||
{ | ||
line = sr.ReadLine(); | ||
|
||
if (line == null) // If we've reached the end of the file | ||
throw new EndOfStreamException($"A quote opened on a field on line {lineNum} was never closed, and we've read to the last line in the file without finding the closing quote"); | ||
|
||
sb.Append("\n"); | ||
sb.Append(line); | ||
numOfQuotes += GetNumberOfChars(line, '"'); | ||
} | ||
|
||
return sb.ToString(); | ||
} | ||
|
||
public static int GetNumberOfChars(string line, char ch) | ||
{ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I thought about it, but I read here that actually the most efficient way is to iterate over the chars: In reply to: 425452787 [](ancestors = 425452787) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If perf testing shows excessive time spent in this function, you could argument the existing C++/assembly intrinsics w/ a character counter. I think |
||
int count = 0; | ||
foreach (char c in line) | ||
{ | ||
if (c == ch) count++; | ||
} | ||
return count; | ||
} | ||
} | ||
|
||
private void ThreadProc() | ||
{ | ||
Contracts.Assert(_batchSize >= 2); | ||
var multilineSB = new StringBuilder(); | ||
|
||
try | ||
{ | ||
|
@@ -487,7 +548,11 @@ private void ThreadProc() | |
// REVIEW: Avoid allocating a string for every line. This would probably require | ||
// introducing a CharSpan type (similar to ReadOnlyMemory but based on char[] or StringBuilder) | ||
// and implementing all the necessary conversion functionality on it. See task 3871. | ||
text = rdr.ReadLine(); | ||
if (_readMultilines) | ||
text = MultiLineReader.ReadMultiLine(rdr, multilineSB, line, true); | ||
else | ||
text = rdr.ReadLine(); | ||
|
||
if (text == null) | ||
goto LNext; | ||
line++; | ||
|
@@ -514,7 +579,11 @@ private void ThreadProc() | |
if (_abort) | ||
return; | ||
|
||
text = rdr.ReadLine(); | ||
if (_readMultilines) | ||
text = MultiLineReader.ReadMultiLine(rdr, multilineSB, line, false); | ||
else | ||
text = rdr.ReadLine(); | ||
|
||
if (text == null) | ||
{ | ||
// We're done with this file. Queue the last partial batch. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1161,6 +1161,11 @@ private bool FetchNextField(ref ScanInfo scan, ReadOnlySpan<char> span) | |
scan.QuotingError = true; | ||
break; | ||
} | ||
|
||
// The logic below allow us to escape quotes (") inside quoted | ||
// fields by using doublo quotes (""). I.e. when the loader | ||
// encounters "" inside a quoted field, it will output only one " | ||
// and continue parsing the rest of the field. | ||
if (span[ichCur] == '"') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Might want to expose this character as a parameter. The other common choice is backslash escaping. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To implement the feature you're suggesting on this comment I do believe that we only need to change the code you've pointed to in here, so it's kind of straightforward. Right now I am going to focus on making sure this PR works well with AutoML/Modelbuilder, that TextSaver works with new lines and that we handle correctly the case of having a badly formatted input file. If I can tackle this with enough time before the next release, then I will try to implement this suggestion. In reply to: 425777440 [](ancestors = 425777440) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Change in plans, I won't address surfacing the new readMultiline option to AutoML/ModelBuilder on this PR. So now I will actually focus on the other things, including implementing this escapechar option before the next release. See #5125 (comment) In reply to: 425777440 [](ancestors = 425777440) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note to myself: I've just realized that to implement the escapechar feature, then I'll also need to change the logic in the MultilineReader, because if the escapechar isn't In reply to: 425777440 [](ancestors = 425777440) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As discussed offline, I'll address this in an upcoming PR. In reply to: 426128704 [](ancestors = 426128704,425777440) |
||
{ | ||
if (ichCur > ichRun) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
#@ TextLoader{ | ||
#@ header+ | ||
#@ sep=tab | ||
#@ col=id:R4:0 | ||
#@ col=description:TX:1 | ||
#@ col=animal:TX:2 | ||
#@ } | ||
id description animal | ||
10 this is a description dog | ||
11 this is a quoted description cat | ||
12 "this is a multiline | ||
quoted description" bird | ||
13 "this has one""doublequote which should be escaped as a single quote" dog | ||
14 "this has ""doublequotes"" inside of it" cat | ||
15 "this is a multiline | ||
quoted description with | ||
|
||
""doublequotes"" and | ||
|
||
empty new lines and | ||
|
||
|
||
|
||
escaped quotes inside of ""it"" | ||
|
||
//and this comment with // shouldn't be ignored | ||
since it is part of the ""multiline""" bird | ||
16 here is text after the empty line dog | ||
17 this is a line with an empty animal "" | ||
0 this is a line with an empty id bird | ||
19 "" dog | ||
20 this is the last row description cat | ||
Wrote 11 rows of length 3 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
#@ TextLoader{ | ||
#@ header+ | ||
#@ sep=tab | ||
#@ col=id:R4:0 | ||
#@ col=description:TX:1 | ||
#@ col=animal:TX:2 | ||
#@ } | ||
id description animal | ||
10 this is a description dog | ||
11 this is a quoted description cat | ||
12 "this is a multiline | ||
quoted description" bird | ||
13 "this has one""doublequote which should be escaped as a single quote" dog | ||
14 "this has ""doublequotes"" inside of it" cat | ||
15 "this is a multiline | ||
quoted description with | ||
|
||
""doublequotes"" and | ||
|
||
empty new lines and | ||
|
||
|
||
|
||
escaped quotes inside of ""it"" | ||
|
||
//and this comment with // shouldn't be ignored | ||
since it is part of the ""multiline""" bird | ||
16 here is text after the empty line dog | ||
17 this is a line with an empty animal "" | ||
0 this is a line with an empty id bird | ||
19 "" dog | ||
20 this is the last row description cat | ||
Wrote 11 rows of length 3 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
10 | ||
11 | ||
12 | ||
13 | ||
14 | ||
15 | ||
16 | ||
17 | ||
0 | ||
19 | ||
20 | ||
this is a description | ||
this is a quoted description | ||
this is a multiline\nquoted description | ||
this has one"doublequote which should be escaped as a single quote | ||
this has "doublequotes" inside of it | ||
this is a multiline\nquoted description with\n\n"doublequotes" and\n\nempty new lines and\n\n\n\nescaped quotes inside of "it"\n\n//and this comment with // shouldn't be ignored\nsince it is part of the "multiline" | ||
here is text after the empty line | ||
this is a line with an empty animal | ||
this is a line with an empty id | ||
|
||
this is the last row description | ||
dog | ||
cat | ||
bird | ||
dog | ||
cat | ||
bird | ||
dog | ||
|
||
bird | ||
dog | ||
cat |
Uh oh!
There was an error while loading. Please reload this page.