dotnet · antoniovs1029 · May 19, 2020 · Dec 17, 2019 · Dec 19, 2019 · Dec 19, 2019
diff --git a/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs b/src/Microsoft.ML.AutoML/ColumnInference/ColumnInferenceApi.cs
@@ -56,7 +56,8 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path
                 AllowSparse = splitInference.AllowSparse,
                 AllowQuoting = splitInference.AllowQuote,
                 HasHeader = hasHeader,
-                TrimWhitespace = trimWhitespace
+                TrimWhitespace = trimWhitespace,
+                ReadMultilines = true // MYTODO: is it ok to hardcode this? it is necessary for my test to pass
             };
             var textLoader = context.Data.CreateTextLoader(typedLoaderOptions);
             var dataView = textLoader.Load(path);
@@ -92,7 +93,8 @@ public static ColumnInferenceResults InferColumns(MLContext context, string path
                 AllowSparse = splitInference.AllowSparse,
                 Separators = new char[] { splitInference.Separator.Value },
                 HasHeader = hasHeader,
-                TrimWhitespace = trimWhitespace
+                TrimWhitespace = trimWhitespace,
+                ReadMultilines = true // is it necessary to put this in here?
             };
 
             return new ColumnInferenceResults()

diff --git a/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs b/src/Microsoft.ML.AutoML/ColumnInference/TextFileContents.cs
@@ -66,7 +66,8 @@ from _sep in separatorCandidates
                     } },
                     Separators = new[] { perm._sep },
                     AllowQuoting = perm._allowQuote,
-                    AllowSparse = perm._allowSparse
+                    AllowSparse = perm._allowSparse,
+                    ReadMultilines = true //MYTODO: is it ok to hard code this in here? it's necessary for the test to pass
                 };
 
                 if (TryParseFile(context, options, source, out result))

diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
@@ -501,6 +501,13 @@ public class Options
             [Argument(ArgumentType.AtMostOnce, HelpText = "Use separate parsing threads?", ShortName = "threads", Hide = true)]
             public bool UseThreads = true;
 
+            /// <summary>
+            /// If true, new line characters are acceptable inside a quoted field, and thus one field can have multiple lines of text inside it.
+            /// If <see cref="TextLoader.Options.AllowQuoting"/> is false, this option is ignored.
+            /// </summary>
+            [Argument(ArgumentType.AtMostOnce, HelpText = "Escape new line characters inside a quoted field? If AllowQuoting is false, this argument is ignored.", ShortName = "multilines", Hide = true)]
+            public bool ReadMultilines = false;
+
             /// <summary>
             /// File containing a header with feature names. If specified, the header defined in the data file is ignored regardless of <see cref="HasHeader"/>.
             /// </summary>
@@ -530,6 +537,7 @@ internal static class Defaults
             internal const char Separator = '\t';
             internal const bool HasHeader = false;
             internal const bool TrimWhitespace = false;
+            internal const bool ReadMultilines = false;
         }
 
         /// <summary>
@@ -694,11 +702,11 @@ public Bindings(TextLoader parent, Column[] cols, IMultiStreamSource headerFile,
                     ch.Assert(0 <= inputSize & inputSize < SrcLim);
                     List<ReadOnlyMemory<char>> lines = null;
                     if (headerFile != null)
-                        Cursor.GetSomeLines(headerFile, 1, ref lines);
+                        Cursor.GetSomeLines(headerFile, 1, false, ref lines);
                     if (needInputSize && inputSize == 0)
-                        Cursor.GetSomeLines(dataSample, 100, ref lines);
+                        Cursor.GetSomeLines(dataSample, 100, parent._readMultilines,ref lines);
                     else if (headerFile == null && parent.HasHeader)
-                        Cursor.GetSomeLines(dataSample, 1, ref lines);
+                        Cursor.GetSomeLines(dataSample, 1, false, ref lines);
 
                     if (needInputSize && inputSize == 0)
                     {
@@ -1081,6 +1089,7 @@ private enum OptionFlags : uint
         private const int SrcLim = int.MaxValue;
 
         private readonly bool _useThreads;
+        private readonly bool _readMultilines;
         private readonly OptionFlags _flags;
         private readonly long _maxRows;
         // Input size is zero for unknown - determined by the data (including sparse rows).
@@ -1138,6 +1147,7 @@ internal TextLoader(IHostEnvironment env, Options options = null, IMultiStreamSo
             _host.Assert(Utils.Size(cols) > 0);
 
             _useThreads = options.UseThreads;
+            _readMultilines = options.AllowQuoting ? options.ReadMultilines : false;
 
             if (options.TrimWhitespace)
                 _flags |= OptionFlags.TrimWhitespace;
@@ -1350,6 +1360,8 @@ private TextLoader(IHost host, ModelLoadContext ctx)
 
             // REVIEW: Should we serialize this? It really isn't part of the data model.
             _useThreads = true;
+            // MYTODO: Should we serialize this? probably yes and also include it in Flags
+            _readMultilines = false;
 
             // *** Binary format ***
             // int: sizeof(Float)
@@ -1458,6 +1470,7 @@ internal static TextLoader CreateTextLoader<TInput>(IHostEnvironment host,
            bool allowQuoting = Defaults.AllowQuoting,
            bool supportSparse = Defaults.AllowSparse,
            bool trimWhitespace = Defaults.TrimWhitespace,
+           bool readMultilines = Defaults.ReadMultilines,
            IMultiStreamSource dataSample = null)
         {
             var userType = typeof(TInput);
@@ -1527,7 +1540,8 @@ internal static TextLoader CreateTextLoader<TInput>(IHostEnvironment host,
                 AllowQuoting = allowQuoting,
                 AllowSparse = supportSparse,
                 TrimWhitespace = trimWhitespace,
-                Columns = columns.ToArray()
+                Columns = columns.ToArray(),
+                ReadMultilines = readMultilines
             };
 
             return new TextLoader(host, options, dataSample: dataSample);

diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderCursor.cs
@@ -4,6 +4,7 @@
 
 using System;
 using System.Collections.Generic;
+using System.IO;
 using System.Text;
 using System.Threading;
 using System.Threading.Tasks;
@@ -145,7 +146,7 @@ public static DataViewRowCursor Create(TextLoader parent, IMultiStreamSource fil
                 SetupCursor(parent, active, 0, out srcNeeded, out cthd);
                 Contracts.Assert(cthd > 0);
 
-                var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, 1);
+                var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._readMultilines, parent._maxRows, 1);
                 var stats = new ParseStats(parent._host, 1);
                 return new Cursor(parent, stats, active, reader, srcNeeded, cthd);
             }
@@ -162,7 +163,7 @@ public static DataViewRowCursor[] CreateSet(TextLoader parent, IMultiStreamSourc
                 SetupCursor(parent, active, n, out srcNeeded, out cthd);
                 Contracts.Assert(cthd > 0);
 
-                var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._maxRows, cthd);
+                var reader = new LineReader(files, BatchSize, 100, parent.HasHeader, parent._readMultilines, parent._maxRows, cthd);
                 var stats = new ParseStats(parent._host, cthd);
                 if (cthd <= 1)
                     return new DataViewRowCursor[1] { new Cursor(parent, stats, active, reader, srcNeeded, 1) };
@@ -204,7 +205,7 @@ public override ValueGetter<DataViewRowId> GetIdGetter()
                     };
             }
 
-            public static void GetSomeLines(IMultiStreamSource source, int count, ref List<ReadOnlyMemory<char>> lines)
+            public static void GetSomeLines(IMultiStreamSource source, int count, bool readMultilines, ref List<ReadOnlyMemory<char>> lines)
             {
                 Contracts.AssertValue(source);
                 Contracts.Assert(count > 0);
@@ -214,7 +215,7 @@ public static void GetSomeLines(IMultiStreamSource source, int count, ref List<R
                     count = 2;
 
                 LineBatch batch;
-                var reader = new LineReader(source, count, 1, false, count, 1);
+                var reader = new LineReader(source, count, 1, false, readMultilines, count, 1);
                 try
                 {
                     batch = reader.GetBatch();
@@ -401,6 +402,7 @@ private sealed class LineReader
             {
                 private readonly long _limit;
                 private readonly bool _hasHeader;
+                private readonly bool _readMultilines;
                 private readonly int _batchSize;
                 private readonly IMultiStreamSource _files;
 
@@ -410,7 +412,7 @@ private sealed class LineReader
                 private Task _thdRead;
                 private volatile bool _abort;
 
-                public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, long limit, int cref)
+                public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool hasHeader, bool readMultilines, long limit, int cref)
                 {
                     // Note that files is allowed to be empty.
                     Contracts.AssertValue(files);
@@ -423,6 +425,7 @@ public LineReader(IMultiStreamSource files, int batchSize, int bufSize, bool has
                     _limit = limit;
                     _hasHeader = hasHeader;
                     _batchSize = batchSize;
+                    _readMultilines = readMultilines;
                     _files = files;
                     _cref = cref;
 
@@ -464,9 +467,69 @@ public LineBatch GetBatch()
                     throw Contracts.ExceptDecode(batch.Exception, "Stream reading encountered exception");
                 }
 
+                private static class MultiLineReader
+                {
+                    /// <summary>
+                    /// Reads one logical row from <paramref name="sr"/>. A quoted field may contain
+                    /// line breaks, so physical lines are appended (joined by a single space) until
+                    /// the running count of '"' characters is even or the reader is exhausted.
+                    /// </summary>
+                    /// <param name="sr">Reader positioned at the start of a row.</param>
+                    /// <param name="sb">Scratch buffer; cleared before use when lines are joined.</param>
+                    /// <param name="ignoreHashLine">If true, lines starting with '#' are returned as-is.</param>
+                    /// <returns>The row text, an empty string for an empty line, or null at end of input.</returns>
+                    public static string ReadMultiLine(TextReader sr, StringBuilder sb, bool ignoreHashLine)
+                    {
+                        string line = sr.ReadLine();
+                        // An empty line, or end of input (null), is handed back unchanged.
+                        if (string.IsNullOrEmpty(line))
+                            return line;
+
+                        // Lines starting with "//" (and '#' when ignoreHashLine is set) are returned without
+                        // quote balancing. The length check keeps a lone "/" from indexing past the end.
+                        if (line.Length > 1 && line[0] == '/' && line[1] == '/')
+                            return line;
+                        if (ignoreHashLine && line[0] == '#')
+                            return line;
+
+                        // An even quote count means every quoted field is closed on this line.
+                        // Two consecutive quotes ("") are an escaped quote and keep the parity unchanged.
+                        long numOfQuotes = GetNumberOfChars(line, '"');
+                        if (numOfQuotes % 2 == 0)
+                            return line;
+
+                        sb.Clear();
+                        sb.Append(line);
+                        while (numOfQuotes % 2 != 0)
+                        {
+                            line = sr.ReadLine();
+                            if (line == null)
+                                break; // End of input with a quote still open; TODO: consider throwing instead.
+                            if (line.Length != 0)
+                                sb.Append(" "); // Lines are joined with a space; TODO: consider "\n" instead.
+                            sb.Append(line);
+                            numOfQuotes += GetNumberOfChars(line, '"');
+                        }
+
+                        return sb.ToString();
+                    }
+
+                    /// <summary>Counts the occurrences of <paramref name="ch"/> in <paramref name="line"/>.</summary>
+                    public static int GetNumberOfChars(string line, char ch)
+                    {
+                        int count = 0;
+                        foreach (char c in line)
+                        {
+                            if (c == ch) count++;
+                        }
+                        return count;
+                    }
+                }
+
                 private void ThreadProc()
                 {
                     Contracts.Assert(_batchSize >= 2);
+                    var multilineSB = new StringBuilder();
 
                     try
                     {
@@ -487,7 +550,11 @@ private void ThreadProc()
                                     // REVIEW: Avoid allocating a string for every line. This would probably require
                                     // introducing a CharSpan type (similar to ReadOnlyMemory but based on char[] or StringBuilder)
                                     // and implementing all the necessary conversion functionality on it. See task 3871.
-                                    text = rdr.ReadLine();
+                                    if (_readMultilines)
+                                        text = MultiLineReader.ReadMultiLine(rdr, multilineSB, true);
+                                    else
+                                        text = rdr.ReadLine();
+
                                     if (text == null)
                                         goto LNext;
                                     line++;
@@ -514,7 +581,11 @@ private void ThreadProc()
                                     if (_abort)
                                         return;
 
-                                    text = rdr.ReadLine();
+                                    if (_readMultilines)
+                                        text = MultiLineReader.ReadMultiLine(rdr, multilineSB, true);
+                                    else
+                                        text = rdr.ReadLine();
+
                                     if (text == null)
                                     {
                                         // We're done with this file. Queue the last partial batch.

diff --git a/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs b/src/Microsoft.ML.Data/DataLoadSave/Text/TextLoaderParser.cs
@@ -1161,6 +1161,11 @@ private bool FetchNextField(ref ScanInfo scan, ReadOnlySpan<char> span)
                                 scan.QuotingError = true;
                                 break;
                             }
+
+                            // The logic below allows us to escape quotes (") inside quoted
+                            // fields by using double quotes (""). I.e. when the loader
+                            // encounters "" inside a quoted field, it will output only one "
+                            // and continue parsing the rest of the field.
                             if (span[ichCur] == '"')
                             {
                                 if (ichCur > ichRun)