dotnet · antoniovs1029 · May 19, 2020 · Dec 17, 2019 · Dec 19, 2019 · Dec 19, 2019
diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
@@ -493,14 +493,21 @@ public class Options
             /// </summary>
             [Argument(ArgumentType.AtMostOnce, ShortName = "header",
                 HelpText = "Data file has header with feature names. Header is read only if options 'hs' and 'hf' are not specified.")]
-            public bool HasHeader;
+            public bool HasHeader = Defaults.HasHeader;
 
             /// <summary>
             /// Whether to use separate parsing threads.
             /// </summary>
             [Argument(ArgumentType.AtMostOnce, HelpText = "Use separate parsing threads?", ShortName = "threads", Hide = true)]
             public bool UseThreads = true;
 
+            /// <summary>
+            /// If true, new line characters are acceptable inside a quoted field, and thus one field can have multiple lines of text inside it.
+            /// If <see cref="TextLoader.Options.AllowQuoting"/> is false, this option is ignored.
+            /// </summary>
+            [Argument(ArgumentType.AtMostOnce, HelpText = "Escape new line characters inside a quoted field? If AllowQuoting is false, this argument is ignored.", ShortName = "multilines", Hide = true)]
+            public bool ReadMultilines = Defaults.ReadMultilines;
+
             /// <summary>
             /// File containing a header with feature names. If specified, the header defined in the data file is ignored regardless of <see cref="HasHeader"/>.
             /// </summary>
@@ -530,6 +537,7 @@ internal static class Defaults
             internal const char Separator = '\t';
             internal const bool HasHeader = false;
             internal const bool TrimWhitespace = false;
+            internal const bool ReadMultilines = false;
         }
 
         /// <summary>
@@ -694,11 +702,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile,
                     ch.Assert(0 <= inputSize & inputSize < SrcLim);
                     List<ReadOnlyMemory<char>> lines = null;
                     if (headerFile != null)
-                        Cursor.GetSomeLines(headerFile, 1, ref lines);
+                        Cursor.GetSomeLines(headerFile, 1, parent.ReadMultilines, parent._separators, ref lines);
                     if (needInputSize && inputSize == 0)
-                        Cursor.GetSomeLines(dataSample, 100, ref lines);
+                        Cursor.GetSomeLines(dataSample, 100, parent.ReadMultilines, parent._separators, ref lines);
                     else if (headerFile == null && parent.HasHeader)
-                        Cursor.GetSomeLines(dataSample, 1, ref lines);
+                        Cursor.GetSomeLines(dataSample, 1, parent.ReadMultilines, parent._separators, ref lines);
 
                     if (needInputSize && inputSize == 0)
                     {
@@ -1055,7 +1063,7 @@ private static VersionInfo GetVersionInfo()
                 // verWrittenCur: 0x00010009, // Introduced _flags
                 //verWrittenCur: 0x0001000A, // Added ForceVector in Range
                 //verWrittenCur: 0x0001000B, // Header now retained if used and present
-                verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType
+                verWrittenCur: 0x0001000C, // Removed Min and Contiguous from KeyType, and added ReadMultilines flag to OptionFlags
                 verReadableCur: 0x0001000A,
                 verWeCanReadBack: 0x00010009,
                 loaderSignature: LoaderSignature,
@@ -1073,8 +1081,8 @@ private enum OptionFlags : uint
             HasHeader = 0x02,
             AllowQuoting = 0x04,
             AllowSparse = 0x08,
-
-            All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse
+            ReadMultilines = 0x10,
+            All = TrimWhitespace | HasHeader | AllowQuoting | AllowSparse | ReadMultilines
         }
 
         // This is reserved to mean the range extends to the end (the segment is variable).
@@ -1095,6 +1103,11 @@ private bool HasHeader
             get { return (_flags & OptionFlags.HasHeader) != 0; }
         }
 
+        // Whether OptionFlags.ReadMultilines is set in _flags. The constructor taking Options sets
+        // that bit only when both options.AllowQuoting and options.ReadMultilines are true.
+        private bool ReadMultilines
+            => (_flags & OptionFlags.ReadMultilines) != 0;
+
         private readonly IHost _host;
         private const string RegistrationName = "TextLoader";
 
@@ -1147,6 +1160,8 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
                 _flags |= OptionFlags.AllowQuoting;
             if (options.AllowSparse)
                 _flags |= OptionFlags.AllowSparse;
+            if (options.AllowQuoting && options.ReadMultilines)
+                _flags |= OptionFlags.ReadMultilines;
 
             // REVIEW: This should be persisted (if it should be maintained).
             _maxRows = options.MaxRows ?? long.MaxValue;

diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
@@ -4,6 +4,7 @@
 
 using System;
 using System.Collections.Generic;
+using System.IO;
 using System.Text;
 using System.Threading;
 using System.Threading.Tasks;
@@ -145,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil
                 SetupCursor(parent, active, 0, out srcNeeded, out cthd);
                 Contracts.Assert(cthd > 0);
 
-                var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, 1);
+                var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, 1);
                 var stats = new ParseStats(parent._host, 1);
                 return new Cursor(parent, stats, active, reader, srcNeeded, cthd);
             }
@@ -162,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc
                 SetupCursor(parent, active, n, out srcNeeded, out cthd);
                 Contracts.Assert(cthd > 0);
 
-                var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, cthd);
+                var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent.ReadMultilines, parent._separators, parent._maxRows, cthd);
                 var stats = new ParseStats(parent._host, cthd);
                 if (cthd <= 1)
                     return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) };
@@ -204,7 +205,7 @@ public override ValueGetter<DataViewRowId> GetIdGetter()
                     };
             }
 
-            public static void GetSomeLines(IMultiStreamSource source, int count, ref List<ReadOnlyMemory<char>> lines)
+            public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, char[] separators, ref List<ReadOnlyMemory<char>> lines)
             {
                 Contracts.AssertValue(source);
                 Contracts.Assert(count > 0);
@@ -214,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, ref List<R
                     count = 2;
 
                 LineBatch batch;
-                var reader = new LineReader(source, count, 1, false, count, 1);
+                var reader = new LineReader(source, count, 1, false, readMultilines, separators, count, 1);
                 try
                 {
                     batch = reader.GetBatch();
@@ -401,6 +402,8 @@ private sealed class LineReader
             {
                 private readonly long _limit;
                 private readonly bool _hasHeader;
+                private readonly bool _readMultilines;
+                private readonly char[] _separators;
                 private readonly int _batchSize;
                 private readonly IMultiStreamSource _files;
 
@@ -410,7 +413,7 @@ private sealed class LineReader
                 private Task _thdRead;
                 private volatile bool _abort;
 
-                public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, long limit, int cref)
+                public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, char[] separators, long limit, int cref)
                 {
                     // Note that files is allowed to be empty.
                     Contracts.AssertValue(files);
@@ -423,6 +426,8 @@ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool has
                     _limit = limit;
                     _hasHeader = hasHeader;
                     _batchSize = batchSize;
+                    _readMultilines = readMultilines;
+                    _separators = separators;
                     _files = files;
                     _cref = cref;
 
@@ -464,6 +469,176 @@ public LineBatch GetBatch()
                     throw Contracts.ExceptDecode(batch.Exception, "Stream reading encountered exception");
                 }
 
+                // Reads logical rows from a TextReader, joining several physical lines into one
+                // row whenever a quoted field is still open at the end of a physical line.
+                private class MultiLineReader
+                {
+                    private readonly char _sep0;
+                    private readonly char[] _separators;
+                    private readonly bool _sepsContainsSpace;
+                    private readonly StringBuilder _sb;
+                    private readonly TextReader _rdr;
+
+                    public MultiLineReader(TextReader rdr, char[] separators)
+                    {
+                        Contracts.AssertNonEmpty(separators);
+                        _sep0 = separators[0];
+                        _separators = separators;
+                        _sepsContainsSpace = IsSep(' ');
+                        _sb = new StringBuilder();
+                        _rdr = rdr;
+                    }
+
+                    // When reading lines that contain quoted fields, the quoted fields can contain
+                    // '\n', so we'll need to read multiple lines (multilines) to get all the fields
+                    // of a given row. Returns null at end of stream; lineNum is only used in the error message.
+                    public string ReadMultiLine(long lineNum, bool ignoreHashLine)
+                    {
+                        string line = _rdr.ReadLine();
+
+                        // if it was an empty line or if we've reached the end of file (i.e. line = null)
+                        if (string.IsNullOrEmpty(line))
+                            return line;
+
+                        // In ML.NET we filter out lines beginning with // and # at the beginning of the file
+                        // Or lines beginning with // elsewhere in the file.
+                        // Thus, we don't care to check if there's a quoted multiline when the line begins with
+                        // these chars. The length check keeps a line that is just "/" from being indexed
+                        // past its end.
+                        if (line.Length > 1 && line[0] == '/' && line[1] == '/')
+                            return line;
+                        if (ignoreHashLine && line[0] == '#')
+                            return line;
+
+                        // Get more lines until the last field of the line doesn't contain its newline
+                        // inside a quoted field
+                        bool lastFieldIncludesNewLine = LastFieldIncludesNewLine(line, false);
+                        if (!lastFieldIncludesNewLine)
+                            return line;
+
+                        _sb.Clear();
+                        _sb.Append(line);
+                        while (lastFieldIncludesNewLine)
+                        {
+                            line = _rdr.ReadLine();
+                            if (line == null)
+                                throw new EndOfStreamException($"A quoted field opened on line {lineNum} was never closed, and we've read to the last line in the file without finding the closing quote");
+
+                            _sb.Append("\n");
+                            _sb.Append(line);
+                            lastFieldIncludesNewLine = LastFieldIncludesNewLine(line, true);
+                        }
+
+                        return _sb.ToString();
+                    }
+
+                    // The startsInsideQuoted parameter indicates if the last field of the previous line
+                    // ended in a quoted field which included the newline character,
+                    // if it is true, then the beginning of this line is considered to be part
+                    // of the last field of the previous line.
+                    public bool LastFieldIncludesNewLine(string line, bool startsInsideQuoted = false)
+                    {
+                        if (line.Length == 0)
+                            return startsInsideQuoted;
+
+                        int ichCur = 0;
+                        int ichLim = line.Length;
+                        bool quotingError = false;
+
+                        bool ret = FieldIncludesNewLine(ref line, ref ichCur, ichLim, ref quotingError, startsInsideQuoted);
+                        while (ichCur < ichLim)
+                        {
+                            ret = FieldIncludesNewLine(ref line, ref ichCur, ichLim, ref quotingError, false);
+                            if (quotingError)
+                                return false;
+
+                            // Skip the separator(s) that follow the field; this also skips empty fields
+                            while (ichCur < ichLim && IsSep(line[ichCur]))
+                                ichCur++;
+                        }
+
+                        return ret;
+                    }
+
+                    // Scans the field starting at ichCur; on success ichCur ends at the field's separator or at ichLim.
+                    // Returns true only if a quoted field is still open at ichLim; quotingError flags a malformed closing quote.
+                    private bool FieldIncludesNewLine(ref string line, ref int ichCur, int ichLim,
+                        ref bool quotingError, bool startsInsideQuoted)
+                    {
+                        if (!startsInsideQuoted && !_sepsContainsSpace)
+                        {
+                            // Ignore leading spaces
+                            while (ichCur < ichLim && line[ichCur] == ' ')
+                                ichCur++;
+                        }
+
+                        // Bounds check first: skipping spaces may have consumed the rest of the line (e.g. trailing spaces)
+                        if (startsInsideQuoted || (ichCur < ichLim && line[ichCur] == '"'))
+                        {
+                            // Quoted Field Case
+                            if (!startsInsideQuoted)
+                                ichCur++;
+
+                            for (; ; ichCur++)
+                            {
+                                if (ichCur >= ichLim)
+                                    // We've reached the end of the line without finding the closing quote,
+                                    // so next line will start on this quoted field
+                                    return true;
+
+                                if (line[ichCur] == '"')
+                                {
+                                    if (++ichCur >= ichLim)
+                                        // Last character in line was the closing quote of the field
+                                        return false;
+
+                                    if (line[ichCur] == '"')
+                                        // 2 Double quotes means escaped quote
+                                        continue;
+
+                                    // If it wasn't an escaped quote, then this is supposed to be
+                                    // the closing quote of the field, and there should only be spaces remaining
+                                    // until the next separator.
+
+                                    if (!_sepsContainsSpace)
+                                    {
+                                        // Ignore spaces that follow the closing quote
+                                        while (ichCur < ichLim && line[ichCur] == ' ')
+                                            ichCur++;
+                                    }
+
+                                    // If there's anything else than spaces or the next separator,
+                                    // this will actually be a QuotingError on the parser, so we decide that this
+                                    // line contains a quoting error, and so it's not going to be considered a valid field
+                                    // and the rest of the line should be ignored.
+                                    if (ichCur >= ichLim || IsSep(line[ichCur]))
+                                        return false;
+
+                                    quotingError = true;
+                                    return false;
+                                }
+                            }
+                        }
+
+                        // Unquoted field case: an unquoted field shouldn't contain new lines
+                        while (ichCur < ichLim && !IsSep(line[ichCur]))
+                            ichCur++;
+                        return false;
+                    }
+
+                    private bool IsSep(char ch)
+                    {
+                        if (ch == _sep0)
+                            return true;
+                        for (int i = 1; i < _separators.Length; i++)
+                        {
+                            if (ch == _separators[i])
+                                return true;
+                        }
+                        return false;
+                    }
+                }
+
                 private void ThreadProc()
                 {
                     Contracts.Assert(_batchSize >= 2);
@@ -480,14 +655,19 @@ private void ThreadProc()
                             string path = _files.GetPathOrNull(ifile);
                             using (var rdr = _files.OpenTextReader(ifile))
                             {
+                                var multilineReader = new MultiLineReader(rdr, _separators);
                                 string text;
                                 long line = 0;
                                 for (; ; )
                                 {
                                     // REVIEW: Avoid allocating a string for every line. This would probably require
                                     // introducing a CharSpan type (similar to ReadOnlyMemory but based on char[] or StringBuilder)
                                     // and implementing all the necessary conversion functionality on it. See task 3871.
-                                    text = rdr.ReadLine();
+                                    if (_readMultilines)
+                                        text = multilineReader.ReadMultiLine(line, true);
+                                    else
+                                        text = rdr.ReadLine();
+
                                     if (text == null)
                                         goto LNext;
                                     line++;
@@ -514,7 +694,11 @@ private void ThreadProc()
                                     if (_abort)
                                         return;
 
-                                    text = rdr.ReadLine();
+                                    if (_readMultilines)
+                                        text = multilineReader.ReadMultiLine(line, false);
+                                    else
+                                        text = rdr.ReadLine();
+
                                     if (text == null)
                                     {
                                         // We're done with this file. Queue the last partial batch.

diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
@@ -1161,6 +1161,11 @@ private bool FetchNextField(ref ScanInfo scan, ReadOnlySpan<char> span)
                                 scan.QuotingError = true;
                                 break;
                             }
+
+                            // The logic below allows us to escape quotes (") inside quoted
+                            // fields by using double quotes (""). I.e. when the loader
+                            // encounters "" inside a quoted field, it will output only one "
+                            // and continue parsing the rest of the field.
                             if (span[ichCur] == '"')
                             {
                                 if (ichCur > ichRun)

diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderSaverCatalog.cs
@@ -64,7 +64,7 @@ public static TextLoader CreateTextLoader(this DataOperationsCatalog catalog,
                 HasHeader = hasHeader,
                 AllowQuoting = allowQuoting,
                 TrimWhitespace = trimWhitespace,
-                AllowSparse = allowSparse
+                AllowSparse = allowSparse,
             };
 
             return new TextLoader(CatalogUtils.GetEnvironment(catalog), options: options, dataSample: dataSample);