-
Notifications
You must be signed in to change notification settings - Fork 1.9k
Fixed the TextTransform bug where chargrams were being computed differently when used with/without the word tokenizer. #548
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
868a72f
f482eea
64390ee
2201433
d1dd161
f458cdb
90cdf2a
3ed6e87
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,12 +64,16 @@ public sealed class Arguments : TransformInputBase | |
public const string LoaderSignature = "CharToken"; | ||
public const string UserName = "Character Tokenizer Transform"; | ||
|
||
// Keep track of the model that was saved with ver:0x00010001 | ||
private readonly bool _isSeparatorStartEnd; | ||
|
||
private static VersionInfo GetVersionInfo() | ||
{ | ||
return new VersionInfo( | ||
modelSignature: "CHARTOKN", | ||
verWrittenCur: 0x00010001, // Initial | ||
verReadableCur: 0x00010001, | ||
//verWrittenCur: 0x00010001, // Initial | ||
verWrittenCur: 0x00010002, // Updated to use UnitSeparator <US> character instead of using <ETX><STX> for vector inputs. | ||
verReadableCur: 0x00010002, | ||
verWeCanReadBack: 0x00010001, | ||
loaderSignature: LoaderSignature); | ||
} | ||
|
@@ -84,6 +88,7 @@ private static VersionInfo GetVersionInfo() | |
private volatile string _keyValuesStr; | ||
private volatile int[] _keyValuesBoundaries; | ||
|
||
private const ushort UnitSeparator = 0x1f; | ||
private const ushort TextStartMarker = 0x02; | ||
private const ushort TextEndMarker = 0x03; | ||
private const int TextMarkersCount = 2; | ||
|
@@ -102,6 +107,7 @@ public CharTokenizeTransform(IHostEnvironment env, Arguments args, IDataView inp | |
|
||
_type = GetOutputColumnType(); | ||
SetMetadata(); | ||
_isSeparatorStartEnd = false; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. FYI there is no need for this. The default value of bool is false. #Resolved |
||
} | ||
|
||
private static ColumnType GetOutputColumnType() | ||
|
@@ -120,6 +126,8 @@ private CharTokenizeTransform(IHost host, ModelLoadContext ctx, IDataView input) | |
// byte: _useMarkerChars value. | ||
_useMarkerChars = ctx.Reader.ReadBoolByte(); | ||
|
||
_isSeparatorStartEnd = ctx.Header.ModelVerReadable < 0x00010002 || ctx.Reader.ReadBoolByte(); | ||
|
||
_type = GetOutputColumnType(); | ||
SetMetadata(); | ||
} | ||
|
@@ -145,6 +153,7 @@ public override void Save(ModelSaveContext ctx) | |
// byte: _useMarkerChars value. | ||
SaveBase(ctx); | ||
ctx.Writer.WriteBoolByte(_useMarkerChars); | ||
ctx.Writer.WriteBoolByte(_isSeparatorStartEnd); | ||
} | ||
|
||
protected override ColumnType GetColumnTypeCore(int iinfo) | ||
|
@@ -399,8 +408,8 @@ private ValueGetter<VBuffer<ushort>> MakeGetterVec(IRow input, int iinfo) | |
|
||
var getSrc = GetSrcGetter<VBuffer<DvText>>(input, iinfo); | ||
var src = default(VBuffer<DvText>); | ||
return | ||
(ref VBuffer<ushort> dst) => | ||
|
||
ValueGetter<VBuffer<ushort>> getterWithStartEndSep = (ref VBuffer<ushort> dst) => | ||
{ | ||
getSrc(ref src); | ||
|
||
|
@@ -438,6 +447,67 @@ private ValueGetter<VBuffer<ushort>> MakeGetterVec(IRow input, int iinfo) | |
|
||
dst = new VBuffer<ushort>(len, values, dst.Indices); | ||
}; | ||
|
||
ValueGetter < VBuffer<ushort> > getterWithUnitSep = (ref VBuffer<ushort> dst) => | ||
{ | ||
getSrc(ref src); | ||
|
||
int len = 0; | ||
|
||
for (int i = 0; i < src.Count; i++) | ||
{ | ||
if (src.Values[i].HasChars) | ||
{ | ||
len += src.Values[i].Length; | ||
|
||
if (i > 0) | ||
len += 1; // add UnitSeparator character to len that will be added | ||
} | ||
} | ||
|
||
if (_useMarkerChars) | ||
len += TextMarkersCount; | ||
|
||
var values = dst.Values; | ||
if (len > 0) | ||
{ | ||
if (Utils.Size(values) < len) | ||
values = new ushort[len]; | ||
|
||
int index = 0; | ||
|
||
// VBuffer<DvText> can be a result of either concatenating text columns together | ||
// or application of word tokenizer before char tokenizer in TextTransform. | ||
// | ||
// Considering VBuffer<DvText> as a single text stream. | ||
// Therefore, prepend and append start and end markers only once i.e. at the start and at end of vector. | ||
// Insert UnitSeparator after every piece of text in the vector. | ||
if (_useMarkerChars) | ||
values[index++] = TextStartMarker; | ||
|
||
for (int i = 0; i < src.Count; i++) | ||
{ | ||
if (!src.Values[i].HasChars) | ||
continue; | ||
|
||
if (i > 0) | ||
values[index++] = UnitSeparator; | ||
|
||
for (int ich = 0; ich < src.Values[i].Length; ich++) | ||
{ | ||
values[index++] = src.Values[i][ich]; | ||
} | ||
} | ||
|
||
if (_useMarkerChars) | ||
values[index++] = TextEndMarker; | ||
|
||
Contracts.Assert(index == len); | ||
} | ||
|
||
dst = new VBuffer<ushort>(len, values, dst.Indices); | ||
}; | ||
return _isSeparatorStartEnd ? getterWithStartEndSep : getterWithUnitSep; | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -262,6 +262,30 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV | |
view = new ConcatTransform(h, new ConcatTransform.Arguments() { Column = xfCols }, view); | ||
} | ||
|
||
if (tparams.NeedsNormalizeTransform) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @justinormont, I moved There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't know if there are adverse effects of running If you make me a build, I'll run a manual regression test. Unfortunately we don't currently have running nightly regression tests to tell us if the text datasets decreased in their core metrics. #Resolved There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't like this change. It's far from clear to me that if someone asks to remove stopwords that they meant for the content of stopwords to be retained in the chargrams. At the very least this ought to be a configurable option. In reply to: 204000786 [](ancestors = 204000786) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @TomFinley, Can you elaborate more on this? Your point is not clear to me. Don't you think it's good to have In reply to: 204201054 [](ancestors = 204201054,204000786) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sorry, I thought you were saying something else; for some reason I got confused. #Resolved |
||
{ | ||
var xfCols = new TextNormalizerCol[textCols.Length]; | ||
string[] dstCols = new string[textCols.Length]; | ||
for (int i = 0; i < textCols.Length; i++) | ||
{ | ||
dstCols[i] = GenerateColumnName(view.Schema, textCols[i], "TextNormalizer"); | ||
tempCols.Add(dstCols[i]); | ||
xfCols[i] = new TextNormalizerCol() { Source = textCols[i], Name = dstCols[i] }; | ||
} | ||
|
||
view = new TextNormalizerTransform(h, | ||
new TextNormalizerArgs() | ||
{ | ||
Column = xfCols, | ||
KeepDiacritics = tparams.KeepDiacritics, | ||
KeepNumbers = tparams.KeepNumbers, | ||
KeepPunctuations = tparams.KeepPunctuations, | ||
TextCase = tparams.TextCase | ||
}, view); | ||
|
||
textCols = dstCols; | ||
} | ||
|
||
if (tparams.NeedsWordTokenizationTransform) | ||
{ | ||
var xfCols = new DelimitedTokenizeTransform.Column[textCols.Length]; | ||
|
@@ -281,34 +305,6 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV | |
view = new DelimitedTokenizeTransform(h, new DelimitedTokenizeTransform.Arguments() { Column = xfCols }, view); | ||
} | ||
|
||
if (tparams.NeedsNormalizeTransform) | ||
{ | ||
string[] srcCols = wordTokCols == null ? textCols : wordTokCols; | ||
var xfCols = new TextNormalizerCol[srcCols.Length]; | ||
string[] dstCols = new string[srcCols.Length]; | ||
for (int i = 0; i < srcCols.Length; i++) | ||
{ | ||
dstCols[i] = GenerateColumnName(view.Schema, srcCols[i], "TextNormalizer"); | ||
tempCols.Add(dstCols[i]); | ||
xfCols[i] = new TextNormalizerCol() { Source = srcCols[i], Name = dstCols[i] }; | ||
} | ||
|
||
view = new TextNormalizerTransform(h, | ||
new TextNormalizerArgs() | ||
{ | ||
Column = xfCols, | ||
KeepDiacritics = tparams.KeepDiacritics, | ||
KeepNumbers = tparams.KeepNumbers, | ||
KeepPunctuations = tparams.KeepPunctuations, | ||
TextCase = tparams.TextCase | ||
}, view); | ||
|
||
if (wordTokCols != null) | ||
wordTokCols = dstCols; | ||
else | ||
textCols = dstCols; | ||
} | ||
|
||
if (tparams.NeedsRemoveStopwordsTransform) | ||
{ | ||
Contracts.Assert(wordTokCols != null, "StopWords transform requires that word tokenization has been applied to the input text."); | ||
|
@@ -360,7 +356,7 @@ public static IDataTransform Create(IHostEnvironment env, Arguments args, IDataV | |
if (tparams.CharExtractorFactory != null) | ||
{ | ||
{ | ||
var srcCols = wordTokCols ?? textCols; | ||
var srcCols = tparams.NeedsRemoveStopwordsTransform ? wordTokCols : textCols; | ||
charTokCols = new string[srcCols.Length]; | ||
var xfCols = new CharTokenizeTransform.Column[srcCols.Length]; | ||
for (int i = 0; i < srcCols.Length; i++) | ||
|
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See other examples. You will see that we keep commented out the version strings from old versions, to track them and so we know why we bumped the version each time. This is important: we have live code in our deserializers that is meant to handle those version bumps, and for that reason we like to have documentation on what exactly changed, so that when we have tests on this or that version in our deserializers, we know why they changed. #Resolved