-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Enable TextLoader to accept new lines in quoted fields #5125
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
e03b0e4
3ddb3b0
9b20ead
34d9efa
ac6f971
8304d62
a9e91e2
9cfaee1
2827c61
a427662
1cafbf0
6116d97
df2ca25
530f41e
d9af9d2
6a7b632
fb6ab28
f592a7f
a13b803
c2d2ac7
13033bf
2983b06
9789479
60c4169
fa28ddd
56279b4
51d9390
5cc7512
5be5f6f
b4e3029
7e16fc7
f0652a5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
|
||
using System; | ||
using System.Collections.Generic; | ||
using System.IO; | ||
using System.Text; | ||
using System.Threading; | ||
using System.Threading.Tasks; | ||
|
@@ -145,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil | |
SetupCursor(parent, active, 0, out srcNeeded, out cthd); | ||
Contracts.Assert(cthd > 0); | ||
|
||
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, 1); | ||
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1); | ||
var stats = new ParseStats(parent._host, 1); | ||
return new Cursor(parent, stats, active, reader, srcNeeded, cthd); | ||
} | ||
|
@@ -162,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc | |
SetupCursor(parent, active, n, out srcNeeded, out cthd); | ||
Contracts.Assert(cthd > 0); | ||
|
||
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, cthd); | ||
var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd); | ||
var stats = new ParseStats(parent._host, cthd); | ||
if (cthd <= 1) | ||
return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) }; | ||
|
@@ -204,7 +205,7 @@ public override ValueGetter<DataViewRowId> GetIdGetter() | |
}; | ||
} | ||
|
||
public static void GetSomeLines(IMultiStreamSource source, int count, ref List<ReadOnlyMemory<char>> lines) | ||
public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List<ReadOnlyMemory<char>> lines) | ||
{ | ||
Contracts.AssertValue(source); | ||
Contracts.Assert(count > 0); | ||
|
@@ -214,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, ref List<R | |
count = 2; | ||
|
||
LineBatch batch; | ||
var reader = new LineReader(source, count, 1, false, count, 1); | ||
var reader = new LineReader(source, count, 1, false, readMultilines, separators, count, 1); | ||
try | ||
{ | ||
batch = reader.GetBatch(); | ||
|
@@ -401,6 +402,8 @@ private sealed class LineReader | |
{ | ||
private readonly long _limit; | ||
private readonly bool _hasHeader; | ||
private readonly bool _readMultilines; | ||
private readonly char[] _separators; | ||
private readonly int _batchSize; | ||
private readonly IMultiStreamSource _files; | ||
|
||
|
@@ -410,7 +413,7 @@ private sealed class LineReader | |
private Task _thdRead; | ||
private volatile bool _abort; | ||
|
||
public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, long limit, int cref) | ||
public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, long limit, int cref) | ||
{ | ||
// Note that files is allowed to be empty. | ||
Contracts.AssertValue(files); | ||
|
@@ -423,6 +426,8 @@ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool has | |
_limit = limit; | ||
_hasHeader = hasHeader; | ||
_batchSize = batchSize; | ||
_readMultilines = readMultilines; | ||
_separators = separators; | ||
_files = files; | ||
_cref = cref; | ||
|
||
|
@@ -464,6 +469,176 @@ public LineBatch GetBatch() | |
throw Contracts.ExceptDecode(batch.Exception, "Stream reading encountered exception"); | ||
} | ||
|
||
private class MultiLineReader | ||
{ | ||
private readonly char _sep0; | ||
private readonly char[] _separators; | ||
private readonly bool _sepsContainsSpace; | ||
private readonly StringBuilder _sb; | ||
private readonly TextReader _rdr; | ||
|
||
// Creates a reader that pulls physical lines from rdr and uses the given
// separator characters to find field boundaries. separators must be non-empty.
public MultiLineReader(TextReader rdr, char[] separators)
{
    Contracts.AssertNonEmpty(separators);

    _rdr = rdr;
    _sb = new StringBuilder();

    _separators = separators;
    _sep0 = separators[0];

    // IsSep reads _sep0 and _separators, so this must run after both are assigned.
    _sepsContainsSpace = IsSep(' ');
}
|
||
// Reads one logical row from the underlying reader.
// When reading lines that contain quoted fields, the quoted fields can contain
// '\n', so we may need to read multiple physical lines (multilines) to get all the
// fields of a given row. The physical lines are joined back together with "\n".
//
// lineNum is only used in the error message thrown when a quoted field is never closed.
// ignoreHashLine: when true, a line starting with '#' is returned as is, without
// checking it for quoted multilines.
//
// Returns null at the end of the stream, the empty string for an empty line, and
// otherwise the (possibly multi-line) text of the row.
// Throws EndOfStreamException if the stream ends while a quoted field is still open.
public string ReadMultiLine(long lineNum, bool ignoreHashLine)
{
    string line = _rdr.ReadLine();

    // if it was an empty line or if we've reached the end of file (i.e. line = null)
    if (string.IsNullOrEmpty(line))
        return line;

    // In ML.NET we filter out lines beginning with // and # at the beginning of the file
    // Or lines beginning with // elsewhere in the file.
    // Thus, we don't care to check if there's a quoted multiline when the line begins with
    // these chars.
    // The length check matters: a line consisting of a single '/' has no line[1] to read.
    if (line.Length > 1 && line[0] == '/' && line[1] == '/')
        return line;
    if (ignoreHashLine && line[0] == '#')
        return line;
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should these check if you're at the top of the file? I think once you've hit the first row of data, no more ignoring of rows is allowed. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Current behavior is that at the beginning of the file both // and # are ignored by LineReader link, but after the first row of data is hit, LineReader only ignores // link Because of this, I actually added the ignoreHashLine flag on this method, and use it on the code you've pointed to. One of the tests I've added on this PR actually shows that "// lines" are ignored correctly throughout the file. In reply to: 425748062 [](ancestors = 425748062) |

    // Get more lines until the last field of the line doesn't contain its newline
    // inside a quoted field
    bool lastFieldIncludesNewLine = LastFieldIncludesNewLine(line, false);
    if (!lastFieldIncludesNewLine)
        return line;

    _sb.Clear();
    _sb.Append(line);
    while (lastFieldIncludesNewLine)
    {
        line = _rdr.ReadLine();

        if (line == null)
            throw new EndOfStreamException($"A quoted field opened on line {lineNum} was never closed, and we've read to the last line in the file without finding the closing quote");

        // ReadLine strips the line terminator, so put a newline back between the pieces.
        _sb.Append("\n");
        _sb.Append(line);
        // This continuation line starts inside the quoted field left open by the previous one.
        lastFieldIncludesNewLine = LastFieldIncludesNewLine(line, true);
    }

    return _sb.ToString();
}
|
||
// Scans a single physical line field by field and reports whether its last field is
// a quoted field that is still open at the end of the line (i.e. the row continues
// on the next physical line).
//
// startsInsideQuoted tells the scanner that the previous line ended inside a quoted
// field, so the beginning of this line belongs to that same field.
// An empty line simply propagates startsInsideQuoted. If a quoting error is found
// while scanning the fields after the first one, the result is false.
public bool LastFieldIncludesNewLine(string line, bool startsInsideQuoted = false)
{
    int end = line.Length;
    if (end == 0)
        return startsInsideQuoted;

    int pos = 0;
    bool sawQuotingError = false;

    // Only the first field can be the continuation of a quoted field from the previous line.
    bool openAtEnd = FieldIncludesNewLine(ref line, ref pos, end, ref sawQuotingError, startsInsideQuoted);

    while (pos < end)
    {
        openAtEnd = FieldIncludesNewLine(ref line, ref pos, end, ref sawQuotingError, false);
        if (sawQuotingError)
            return false;

        // Step over the separator(s) that follow the field, which also skips empty fields.
        for (; pos < end && IsSep(line[pos]); pos++)
        {
        }
    }

    return openAtEnd;
}
|
||
// Scans one field of line, starting at ichCur and never reading at or past ichLim,
// and advances ichCur past the characters it consumed.
//
// Returns true only when the field is a quoted field whose closing quote was not
// found before the end of the line, meaning the field continues on the next line.
// Sets quotingError when a closing quote is followed by something other than spaces
// (when space is not a separator), a separator, or the end of the line.
// startsInsideQuoted: the scan begins already inside a quoted field that was opened
// on a previous line, so no opening quote is expected.
private bool FieldIncludesNewLine(ref string line, ref int ichCur, int ichLim,
    ref bool quotingError, bool startsInsideQuoted)
{
    if (!startsInsideQuoted && !_sepsContainsSpace)
    {
        // Ignore leading spaces
        while (ichCur < ichLim && line[ichCur] == ' ')
            ichCur++;
    }

    // Nothing left to scan (e.g. a line of only spaces, or trailing spaces after the
    // last separator). Without this check line[ichCur] below would be out of range.
    // When startsInsideQuoted is true the quoted field is still open, which is the
    // same answer the quoted-field loop below would give.
    if (ichCur >= ichLim)
        return startsInsideQuoted;

    if (startsInsideQuoted || line[ichCur] == '"')
    {
        // Quoted Field Case

        // Step over the opening quote, unless it was on a previous line.
        if (!startsInsideQuoted)
            ichCur++;

        for (; ; ichCur++)
        {
            if (ichCur >= ichLim)
                // We've reached the end of the line without finding the closing quote,
                // so next line will start on this quoted field
                return true;
Comment on lines
+572
to
+584
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've rewritten the logic in Multilinereader, in order to be more flexible in what we accept (following this comment: #5125 (comment) ) In principle, it's the same as in previous iterations of the PR: simply read every character in the read line, and decide if we should read more lines as part of the current row. But now this code mimics the one found in TextLoader.Parser.HelperImpl.FetchNextField() to actually make a distinction between quoted and non-quoted fields. So now the behavior has changed for the following scenarios, which are now accepted by Multilinereader without much problem (although the parser might throw or log an error when processing these). Notice that these cases are actually not accepted by the RFC and can be considered invalid formats, so we actually have liberty on how we handle this, and I think the way I've handled it will cause the least negative impact for the users:

            if (line[ichCur] == '"')
            {
                if (++ichCur >= ichLim)
                    // Last character in line was the closing quote of the field
                    return false;

                if (line[ichCur] == '"')
                    // 2 Double quotes means escaped quote
                    continue;

                // If it wasn't an escaped quote, then this is supposed to be
                // the closing quote of the field, and there should only be spaces remaining
                // until the next separator.

                if (!_sepsContainsSpace)
                {
                    // Ignore spaces that follow the closing quote
                    while (ichCur < ichLim && line[ichCur] == ' ')
                        ichCur++;
                }

                // If there's anything else than spaces or the next separator,
                // this will actually be a QuotingError on the parser, so we decide that this
                // line contains a quoting error, and so it's not going to be considered a valid field
                // and the rest of the line should be ignored.
                if (ichCur >= ichLim || IsSep(line[ichCur]))
                    return false;

                quotingError = true;
                return false;
            }
        }
    }

    // Unquoted field case.
    // An unquoted field shouldn't contain new lines
    while (ichCur < ichLim && !IsSep(line[ichCur]))
    {
        ichCur++;
    }
    return false;
}
|
||
// Returns true when ch equals one of the configured separator characters.
private bool IsSep(char ch)
{
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I thought about it, but I read here that actually the most efficient way is to iterate over the chars: In reply to: 425452787 [](ancestors = 425452787) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If perf testing shows excessive time spent in this function, you could argument the existing C++/assembly intrinsics w/ a character counter. I think |

    // _sep0 is the first separator (set from separators[0] in the constructor);
    // it is compared first, before walking the rest of the array.
    if (ch == _sep0)
        return true;

    // Index 0 was already covered by _sep0, so start at 1.
    int idx = 1;
    while (idx < _separators.Length)
    {
        if (_separators[idx] == ch)
            return true;
        idx++;
    }

    return false;
}
} | ||
|
||
private void ThreadProc() | ||
{ | ||
Contracts.Assert(_batchSize >= 2); | ||
|
@@ -480,14 +655,19 @@ private void ThreadProc() | |
string path = _files.GetPathOrNull(ifile); | ||
using (var rdr = _files.OpenTextReader(ifile)) | ||
{ | ||
var multilineReader = new MultiLineReader(rdr, _separators); | ||
string text; | ||
long line = 0; | ||
for (; ; ) | ||
{ | ||
// REVIEW: Avoid allocating a string for every line. This would probably require | ||
// introducing a CharSpan type (similar to ReadOnlyMemory but based on char[] or StringBuilder) | ||
// and implementing all the necessary conversion functionality on it. See task 3871. | ||
text = rdr.ReadLine(); | ||
if (_readMultilines) | ||
text = multilineReader.ReadMultiLine(line, true); | ||
else | ||
text = rdr.ReadLine(); | ||
|
||
if (text == null) | ||
goto LNext; | ||
line++; | ||
|
@@ -514,7 +694,11 @@ private void ThreadProc() | |
if (_abort) | ||
return; | ||
|
||
text = rdr.ReadLine(); | ||
if (_readMultilines) | ||
text = multilineReader.ReadMultiLine(line, false); | ||
else | ||
text = rdr.ReadLine(); | ||
|
||
if (text == null) | ||
{ | ||
// We're done with this file. Queue the last partial batch. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1161,6 +1161,11 @@ private bool FetchNextField(ref ScanInfo scan, ReadOnlySpan<char> span) | |
scan.QuotingError = true; | ||
break; | ||
} | ||
|
||
// The logic below allows us to escape quotes (") inside quoted | ||
// fields by using double quotes (""). I.e. when the loader | ||
// encounters "" inside a quoted field, it will output only one " | ||
// and continue parsing the rest of the field. | ||
if (span[ichCur] == '"') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Might want to expose this character as a parameter. The other common choice is backslash escaping. #Resolved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To implement the feature you're suggesting on this comment I do believe that we only need to change the code you've pointed to in here, so it's kind of straightforward. Right now I am going to focus on making sure this PR works well with AutoML/Modelbuilder, that TextSaver works with new lines and that we handle correctly the case of having a badly formatted input file. If I can tackle this with enough time before the next release, then I will try to implement this suggestion. In reply to: 425777440 [](ancestors = 425777440) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Change in plans, I won't address surfacing the new readMultiline option to AutoML/ModelBuilder on this PR. So now I will actually focus on the other things, including implementing this escapechar option before the next release. See #5125 (comment) In reply to: 425777440 [](ancestors = 425777440) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note to myself: I've just realized that to implement the escapechar feature, then I'll also need to change the logic in the MultilineReader, because if the escapechar isn't In reply to: 425777440 [](ancestors = 425777440) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 
As discussed offline, I'll address this in an upcoming PR. In reply to: 426128704 [](ancestors = 426128704,425777440) |
||
{ | ||
if (ichCur > ichRun) | ||
|
Uh oh!
There was an error while loading. Please reload this page.